about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorTony Luck <tony.luck@intel.com>2017-03-27 05:33:03 -0400
committerIngo Molnar <mingo@kernel.org>2017-03-28 02:55:01 -0400
commit5de97c9f6d85fd83af76e09e338b18e7adb1ae60 (patch)
tree22c9624f8d825df19c4ff1bbec04abfd59c1ac72
parent011d8261117249eab97bc86a8e1ac7731e03e319 (diff)
x86/mce: Factor out and deprecate the /dev/mcelog driver
Move all code relating to /dev/mcelog to a separate source file.
/dev/mcelog driver can now operate from the machine check notifier with
lowest prio.

Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Move the mce_helper and trigger functionality behind CONFIG_X86_MCELOG_LEGACY. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170327093304.10683-6-bp@alien8.de
[ Renamed CONFIG_X86_MCELOG to CONFIG_X86_MCELOG_LEGACY. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  arch/x86/Kconfig                             10
-rw-r--r--  arch/x86/include/asm/mce.h                    1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile           2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/dev-mcelog.c     397
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h     8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c            372
6 files changed, 426 insertions(+), 364 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a294ee..43e6bac6b950 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1043,6 +1043,14 @@ config X86_MCE
1043 The action the kernel takes depends on the severity of the problem, 1043 The action the kernel takes depends on the severity of the problem,
1044 ranging from warning messages to halting the machine. 1044 ranging from warning messages to halting the machine.
1045 1045
1046config X86_MCELOG_LEGACY
1047 bool "Support for deprecated /dev/mcelog character device"
1048 depends on X86_MCE
1049 ---help---
1050 Enable support for /dev/mcelog which is needed by the old mcelog
1051 userspace logging daemon. Consider switching to the new generation
1052 rasdaemon solution.
1053
1046config X86_MCE_INTEL 1054config X86_MCE_INTEL
1047 def_bool y 1055 def_bool y
1048 prompt "Intel MCE features" 1056 prompt "Intel MCE features"
@@ -1072,7 +1080,7 @@ config X86_MCE_THRESHOLD
1072 def_bool y 1080 def_bool y
1073 1081
1074config X86_MCE_INJECT 1082config X86_MCE_INJECT
1075 depends on X86_MCE && X86_LOCAL_APIC 1083 depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
1076 tristate "Machine check injector support" 1084 tristate "Machine check injector support"
1077 ---help--- 1085 ---help---
1078 Provide support for injecting machine checks for testing purposes. 1086 Provide support for injecting machine checks for testing purposes.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c5ae545d27d8..4fd5195deed0 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -196,6 +196,7 @@ enum mce_notifier_prios {
196 MCE_PRIO_EXTLOG = INT_MAX - 2, 196 MCE_PRIO_EXTLOG = INT_MAX - 2,
197 MCE_PRIO_NFIT = INT_MAX - 3, 197 MCE_PRIO_NFIT = INT_MAX - 3,
198 MCE_PRIO_EDAC = INT_MAX - 4, 198 MCE_PRIO_EDAC = INT_MAX - 4,
199 MCE_PRIO_MCELOG = 1,
199 MCE_PRIO_LOWEST = 0, 200 MCE_PRIO_LOWEST = 0,
200}; 201};
201 202
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index a3311c886194..43051f0777d4 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o 9obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
10 10
11obj-$(CONFIG_ACPI_APEI) += mce-apei.o 11obj-$(CONFIG_ACPI_APEI) += mce-apei.o
12
13obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000000..9c632cb88546
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,397 @@
1/*
2 * /dev/mcelog driver
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/miscdevice.h>
14#include <linux/slab.h>
15#include <linux/kmod.h>
16#include <linux/poll.h>
17
18#include "mce-internal.h"
19
static DEFINE_MUTEX(mce_chrdev_read_mutex);

/* Path of the user mode helper run on an MCE event (sysfs "trigger" attr) */
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

/*
 * Acquire-load a log index.  Reading mcelog.next is only safe under
 * rcu_read_lock_sched() or with mce_chrdev_read_mutex held; the lockdep
 * assertion below catches misuse.
 */
#define mce_log_get_idx_check(p) \
({ \
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
			 !lockdep_is_held(&mce_chrdev_read_mutex), \
			 "suspicious mce_log_get_idx_check() usage"); \
	smp_load_acquire(&(p)); \
})
32
33/*
34 * Lockless MCE logging infrastructure.
35 * This avoids deadlocks on printk locks without having to break locks. Also
36 * separate MCEs from kernel messages to avoid bogus bug reports.
37 */
38
39static struct mce_log_buffer mcelog = {
40 .signature = MCE_LOG_SIGNATURE,
41 .len = MCE_LOG_LEN,
42 .recordlen = sizeof(struct mce),
43};
44
45static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
46
47/* User mode helper program triggered by machine check event */
48extern char mce_helper[128];
49
/*
 * Notifier callback: append one MCE record to the legacy mcelog ring
 * buffer and wake readers.  Registered at MCE_PRIO_MCELOG, i.e. it runs
 * after the other consumers on the decode chain.
 *
 * Lockless: writers claim a slot with cmpxchg() on mcelog.next and
 * publish it by setting ->finished after the record copy.
 */
static int dev_mce_log(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned int next, entry;

	wmb();
	for (;;) {
		entry = mce_log_get_idx_check(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return NOTIFY_OK;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim slot 'entry'; restart from the top if we raced */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish only after the copy above is visible to other CPUs */
	mcelog.entry[entry].finished = 1;
	wmb();

	/* wake processes polling /dev/mcelog */
	wake_up_interruptible(&mce_chrdev_wait);

	return NOTIFY_OK;
}
93
/* Lowest real priority on the chain: dev_mce_log() sees records last */
static struct notifier_block dev_mcelog_nb = {
	.notifier_call	= dev_mce_log,
	.priority	= MCE_PRIO_MCELOG,
};
98
/* Workqueue callback: exec the helper (can't be done from IRQ context) */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);


/*
 * Schedule the user mode helper if one is configured.  Called from
 * mce_notify_irq(); defers the actual exec to process context.
 */
void mce_work_trigger(void)
{
	if (mce_helper[0])
		schedule_work(&mce_trigger_work);
}
112
113static ssize_t
114show_trigger(struct device *s, struct device_attribute *attr, char *buf)
115{
116 strcpy(buf, mce_helper);
117 strcat(buf, "\n");
118 return strlen(mce_helper) + 1;
119}
120
121static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
122 const char *buf, size_t siz)
123{
124 char *p;
125
126 strncpy(mce_helper, buf, sizeof(mce_helper));
127 mce_helper[sizeof(mce_helper)-1] = 0;
128 p = strchr(mce_helper, '\n');
129
130 if (p)
131 *p = 0;
132
133 return strlen(mce_helper) + !!p;
134}
135
136DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
137
/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

/* Protects the two open-state counters below */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */
145
146static int mce_chrdev_open(struct inode *inode, struct file *file)
147{
148 spin_lock(&mce_chrdev_state_lock);
149
150 if (mce_chrdev_open_exclu ||
151 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
152 spin_unlock(&mce_chrdev_state_lock);
153
154 return -EBUSY;
155 }
156
157 if (file->f_flags & O_EXCL)
158 mce_chrdev_open_exclu = 1;
159 mce_chrdev_open_count++;
160
161 spin_unlock(&mce_chrdev_state_lock);
162
163 return nonseekable_open(inode, file);
164}
165
/* Drop one open reference; the exclusive flag is cleared on any release */
static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}
177
/* on_each_cpu() callback: record this CPU's TSC for mce_chrdev_read() */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	cpu_tsc[smp_processor_id()] = rdtsc();
}

/* Set once APEI/ERST has no (more) records from the previous boot */
static int mce_apei_read_done;
186
/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
/*
 * Returns 0 and advances *ubuf by one record on success; returns 0 with
 * *ubuf unchanged when nothing is available; negative errno otherwise.
 */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}
227
/*
 * Read and clear all pending MCE records.  Only full reads are
 * supported: userspace must offer space for MCE_LOG_LEN records.
 * Records preserved across a reboot (APEI/ERST) are drained first.
 */
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		/* Stop if APEI failed or actually delivered a record */
		if (err || buf != ubuf)
			goto out;
	}

	next = mce_log_get_idx_check(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			/*
			 * A writer claimed this slot but has not set
			 * ->finished yet: give it ~2 jiffies, then drop
			 * the entry.
			 */
			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the index; loop again if new entries raced in */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		/* Only take records logged before this read started */
		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}
310
311static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
312{
313 poll_wait(file, &mce_chrdev_wait, wait);
314 if (READ_ONCE(mcelog.next))
315 return POLLIN | POLLRDNORM;
316 if (!mce_apei_read_done && apei_check_mce())
317 return POLLIN | POLLRDNORM;
318 return 0;
319}
320
/*
 * Legacy mcelog ioctls: report record/log sizes and atomically
 * get-and-clear the overflow flags.  Requires CAP_SYS_ADMIN.
 */
static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* Snapshot and clear the flags word in one atomic step */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
347
/* Write handler installed by the mce-inject module, when loaded */
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

/*
 * Let mce-inject hook writes to /dev/mcelog for error injection.
 */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
				 const char __user *ubuf,
				 size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
358
359static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
360 size_t usize, loff_t *off)
361{
362 if (mce_write)
363 return mce_write(filp, ubuf, usize, off);
364 else
365 return -EINVAL;
366}
367
368static const struct file_operations mce_chrdev_ops = {
369 .open = mce_chrdev_open,
370 .release = mce_chrdev_release,
371 .read = mce_chrdev_read,
372 .write = mce_chrdev_write,
373 .poll = mce_chrdev_poll,
374 .unlocked_ioctl = mce_chrdev_ioctl,
375 .llseek = no_llseek,
376};
377
378static struct miscdevice mce_chrdev_device = {
379 MISC_MCELOG_MINOR,
380 "mcelog",
381 &mce_chrdev_ops,
382};
383
/*
 * Register /dev/mcelog and hook dev_mce_log() into the MCE decode
 * chain.  device_initcall_sync() orders this after the core MCE setup
 * done in mce.c's mcheck_init_device().
 */
static __init int dev_mcelog_init_device(void)
{
	int err;

	/* register character device /dev/mcelog */
	err = misc_register(&mce_chrdev_device);
	if (err) {
		pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
		return err;
	}
	mce_register_decode_chain(&dev_mcelog_nb);
	return 0;
}
device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 903043e6a62b..7f2a7c54391f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
96 m1->addr != m2->addr || 96 m1->addr != m2->addr ||
97 m1->misc != m2->misc; 97 m1->misc != m2->misc;
98} 98}
99
100extern struct device_attribute dev_attr_trigger;
101
102#ifdef CONFIG_X86_MCELOG_LEGACY
103extern void mce_work_trigger(void);
104#else
105static inline void mce_work_trigger(void) { }
106#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4a907758a516..36082c7fe4a0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -54,17 +54,7 @@
54 54
55#include "mce-internal.h" 55#include "mce-internal.h"
56 56
57static DEFINE_MUTEX(mce_chrdev_read_mutex); 57static DEFINE_MUTEX(mce_log_mutex);
58
59static int mce_chrdev_open_count; /* #times opened */
60
61#define mce_log_get_idx_check(p) \
62({ \
63 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
64 !lockdep_is_held(&mce_chrdev_read_mutex), \
65 "suspicious mce_log_get_idx_check() usage"); \
66 smp_load_acquire(&(p)); \
67})
68 58
69#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
70#include <trace/events/mce.h> 60#include <trace/events/mce.h>
@@ -89,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
89 .monarch_timeout = -1 79 .monarch_timeout = -1
90}; 80};
91 81
92/* User mode helper program triggered by machine check event */
93static unsigned long mce_need_notify;
94static char mce_helper[128];
95static char *mce_helper_argv[2] = { mce_helper, NULL };
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
98
99static DEFINE_PER_CPU(struct mce, mces_seen); 82static DEFINE_PER_CPU(struct mce, mces_seen);
100static int cpu_missing; 83static unsigned long mce_need_notify;
84static int cpu_missing;
101 85
102/* 86/*
103 * MCA banks polled by the period polling timer for corrected events. 87 * MCA banks polled by the period polling timer for corrected events.
@@ -147,18 +131,6 @@ void mce_setup(struct mce *m)
147DEFINE_PER_CPU(struct mce, injectm); 131DEFINE_PER_CPU(struct mce, injectm);
148EXPORT_PER_CPU_SYMBOL_GPL(injectm); 132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
149 133
150/*
151 * Lockless MCE logging infrastructure.
152 * This avoids deadlocks on printk locks without having to break locks. Also
153 * separate MCEs from kernel messages to avoid bogus bug reports.
154 */
155
156static struct mce_log_buffer mcelog_buf = {
157 .signature = MCE_LOG_SIGNATURE,
158 .len = MCE_LOG_LEN,
159 .recordlen = sizeof(struct mce),
160};
161
162void mce_log(struct mce *m) 134void mce_log(struct mce *m)
163{ 135{
164 if (!mce_gen_pool_add(m)) 136 if (!mce_gen_pool_add(m))
@@ -167,9 +139,9 @@ void mce_log(struct mce *m)
167 139
168void mce_inject_log(struct mce *m) 140void mce_inject_log(struct mce *m)
169{ 141{
170 mutex_lock(&mce_chrdev_read_mutex); 142 mutex_lock(&mce_log_mutex);
171 mce_log(m); 143 mce_log(m);
172 mutex_unlock(&mce_chrdev_read_mutex); 144 mutex_unlock(&mce_log_mutex);
173} 145}
174EXPORT_SYMBOL_GPL(mce_inject_log); 146EXPORT_SYMBOL_GPL(mce_inject_log);
175 147
@@ -582,7 +554,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
582 void *data) 554 void *data)
583{ 555{
584 struct mce *m = (struct mce *)data; 556 struct mce *m = (struct mce *)data;
585 unsigned int next, entry;
586 557
587 if (!m) 558 if (!m)
588 return NOTIFY_DONE; 559 return NOTIFY_DONE;
@@ -593,38 +564,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
593 /* Emit the trace record: */ 564 /* Emit the trace record: */
594 trace_mce_record(m); 565 trace_mce_record(m);
595 566
596 wmb();
597 for (;;) {
598 entry = mce_log_get_idx_check(mcelog_buf.next);
599 for (;;) {
600
601 /*
602 * When the buffer fills up discard new entries.
603 * Assume that the earlier errors are the more
604 * interesting ones:
605 */
606 if (entry >= MCE_LOG_LEN) {
607 set_bit(MCE_OVERFLOW,
608 (unsigned long *)&mcelog_buf.flags);
609 return NOTIFY_DONE;
610 }
611 /* Old left over entry. Skip: */
612 if (mcelog_buf.entry[entry].finished) {
613 entry++;
614 continue;
615 }
616 break;
617 }
618 smp_rmb();
619 next = entry + 1;
620 if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
621 break;
622 }
623 memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
624 wmb();
625 mcelog_buf.entry[entry].finished = 1;
626 wmb();
627
628 set_bit(0, &mce_need_notify); 567 set_bit(0, &mce_need_notify);
629 568
630 mce_notify_irq(); 569 mce_notify_irq();
@@ -669,10 +608,6 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
669 if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) 608 if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
670 return NOTIFY_DONE; 609 return NOTIFY_DONE;
671 610
672 /* Don't print when mcelog is running */
673 if (mce_chrdev_open_count > 0)
674 return NOTIFY_DONE;
675
676 __print_mce(m); 611 __print_mce(m);
677 612
678 return NOTIFY_DONE; 613 return NOTIFY_DONE;
@@ -1456,13 +1391,6 @@ static void mce_timer_delete_all(void)
1456 del_timer_sync(&per_cpu(mce_timer, cpu)); 1391 del_timer_sync(&per_cpu(mce_timer, cpu));
1457} 1392}
1458 1393
1459static void mce_do_trigger(struct work_struct *work)
1460{
1461 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1462}
1463
1464static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1465
1466/* 1394/*
1467 * Notify the user(s) about new machine check events. 1395 * Notify the user(s) about new machine check events.
1468 * Can be called from interrupt context, but not from machine check/NMI 1396 * Can be called from interrupt context, but not from machine check/NMI
@@ -1474,11 +1402,7 @@ int mce_notify_irq(void)
1474 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1402 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1475 1403
1476 if (test_and_clear_bit(0, &mce_need_notify)) { 1404 if (test_and_clear_bit(0, &mce_need_notify)) {
1477 /* wake processes polling /dev/mcelog */ 1405 mce_work_trigger();
1478 wake_up_interruptible(&mce_chrdev_wait);
1479
1480 if (mce_helper[0])
1481 schedule_work(&mce_trigger_work);
1482 1406
1483 if (__ratelimit(&ratelimit)) 1407 if (__ratelimit(&ratelimit))
1484 pr_info(HW_ERR "Machine check events logged\n"); 1408 pr_info(HW_ERR "Machine check events logged\n");
@@ -1886,251 +1810,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1886 1810
1887} 1811}
1888 1812
1889/*
1890 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1891 */
1892
1893static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1894static int mce_chrdev_open_exclu; /* already open exclusive? */
1895
1896static int mce_chrdev_open(struct inode *inode, struct file *file)
1897{
1898 spin_lock(&mce_chrdev_state_lock);
1899
1900 if (mce_chrdev_open_exclu ||
1901 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1902 spin_unlock(&mce_chrdev_state_lock);
1903
1904 return -EBUSY;
1905 }
1906
1907 if (file->f_flags & O_EXCL)
1908 mce_chrdev_open_exclu = 1;
1909 mce_chrdev_open_count++;
1910
1911 spin_unlock(&mce_chrdev_state_lock);
1912
1913 return nonseekable_open(inode, file);
1914}
1915
1916static int mce_chrdev_release(struct inode *inode, struct file *file)
1917{
1918 spin_lock(&mce_chrdev_state_lock);
1919
1920 mce_chrdev_open_count--;
1921 mce_chrdev_open_exclu = 0;
1922
1923 spin_unlock(&mce_chrdev_state_lock);
1924
1925 return 0;
1926}
1927
1928static void collect_tscs(void *data)
1929{
1930 unsigned long *cpu_tsc = (unsigned long *)data;
1931
1932 cpu_tsc[smp_processor_id()] = rdtsc();
1933}
1934
1935static int mce_apei_read_done;
1936
1937/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1938static int __mce_read_apei(char __user **ubuf, size_t usize)
1939{
1940 int rc;
1941 u64 record_id;
1942 struct mce m;
1943
1944 if (usize < sizeof(struct mce))
1945 return -EINVAL;
1946
1947 rc = apei_read_mce(&m, &record_id);
1948 /* Error or no more MCE record */
1949 if (rc <= 0) {
1950 mce_apei_read_done = 1;
1951 /*
1952 * When ERST is disabled, mce_chrdev_read() should return
1953 * "no record" instead of "no device."
1954 */
1955 if (rc == -ENODEV)
1956 return 0;
1957 return rc;
1958 }
1959 rc = -EFAULT;
1960 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1961 return rc;
1962 /*
1963 * In fact, we should have cleared the record after that has
1964 * been flushed to the disk or sent to network in
1965 * /sbin/mcelog, but we have no interface to support that now,
1966 * so just clear it to avoid duplication.
1967 */
1968 rc = apei_clear_mce(record_id);
1969 if (rc) {
1970 mce_apei_read_done = 1;
1971 return rc;
1972 }
1973 *ubuf += sizeof(struct mce);
1974
1975 return 0;
1976}
1977
1978static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1979 size_t usize, loff_t *off)
1980{
1981 char __user *buf = ubuf;
1982 unsigned long *cpu_tsc;
1983 unsigned prev, next;
1984 int i, err;
1985
1986 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1987 if (!cpu_tsc)
1988 return -ENOMEM;
1989
1990 mutex_lock(&mce_chrdev_read_mutex);
1991
1992 if (!mce_apei_read_done) {
1993 err = __mce_read_apei(&buf, usize);
1994 if (err || buf != ubuf)
1995 goto out;
1996 }
1997
1998 next = mce_log_get_idx_check(mcelog_buf.next);
1999
2000 /* Only supports full reads right now */
2001 err = -EINVAL;
2002 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
2003 goto out;
2004
2005 err = 0;
2006 prev = 0;
2007 do {
2008 for (i = prev; i < next; i++) {
2009 unsigned long start = jiffies;
2010 struct mce *m = &mcelog_buf.entry[i];
2011
2012 while (!m->finished) {
2013 if (time_after_eq(jiffies, start + 2)) {
2014 memset(m, 0, sizeof(*m));
2015 goto timeout;
2016 }
2017 cpu_relax();
2018 }
2019 smp_rmb();
2020 err |= copy_to_user(buf, m, sizeof(*m));
2021 buf += sizeof(*m);
2022timeout:
2023 ;
2024 }
2025
2026 memset(mcelog_buf.entry + prev, 0,
2027 (next - prev) * sizeof(struct mce));
2028 prev = next;
2029 next = cmpxchg(&mcelog_buf.next, prev, 0);
2030 } while (next != prev);
2031
2032 synchronize_sched();
2033
2034 /*
2035 * Collect entries that were still getting written before the
2036 * synchronize.
2037 */
2038 on_each_cpu(collect_tscs, cpu_tsc, 1);
2039
2040 for (i = next; i < MCE_LOG_LEN; i++) {
2041 struct mce *m = &mcelog_buf.entry[i];
2042
2043 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
2044 err |= copy_to_user(buf, m, sizeof(*m));
2045 smp_rmb();
2046 buf += sizeof(*m);
2047 memset(m, 0, sizeof(*m));
2048 }
2049 }
2050
2051 if (err)
2052 err = -EFAULT;
2053
2054out:
2055 mutex_unlock(&mce_chrdev_read_mutex);
2056 kfree(cpu_tsc);
2057
2058 return err ? err : buf - ubuf;
2059}
2060
2061static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
2062{
2063 poll_wait(file, &mce_chrdev_wait, wait);
2064 if (READ_ONCE(mcelog_buf.next))
2065 return POLLIN | POLLRDNORM;
2066 if (!mce_apei_read_done && apei_check_mce())
2067 return POLLIN | POLLRDNORM;
2068 return 0;
2069}
2070
2071static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
2072 unsigned long arg)
2073{
2074 int __user *p = (int __user *)arg;
2075
2076 if (!capable(CAP_SYS_ADMIN))
2077 return -EPERM;
2078
2079 switch (cmd) {
2080 case MCE_GET_RECORD_LEN:
2081 return put_user(sizeof(struct mce), p);
2082 case MCE_GET_LOG_LEN:
2083 return put_user(MCE_LOG_LEN, p);
2084 case MCE_GETCLEAR_FLAGS: {
2085 unsigned flags;
2086
2087 do {
2088 flags = mcelog_buf.flags;
2089 } while (cmpxchg(&mcelog_buf.flags, flags, 0) != flags);
2090
2091 return put_user(flags, p);
2092 }
2093 default:
2094 return -ENOTTY;
2095 }
2096}
2097
2098static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
2099 size_t usize, loff_t *off);
2100
2101void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
2102 const char __user *ubuf,
2103 size_t usize, loff_t *off))
2104{
2105 mce_write = fn;
2106}
2107EXPORT_SYMBOL_GPL(register_mce_write_callback);
2108
2109static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
2110 size_t usize, loff_t *off)
2111{
2112 if (mce_write)
2113 return mce_write(filp, ubuf, usize, off);
2114 else
2115 return -EINVAL;
2116}
2117
2118static const struct file_operations mce_chrdev_ops = {
2119 .open = mce_chrdev_open,
2120 .release = mce_chrdev_release,
2121 .read = mce_chrdev_read,
2122 .write = mce_chrdev_write,
2123 .poll = mce_chrdev_poll,
2124 .unlocked_ioctl = mce_chrdev_ioctl,
2125 .llseek = no_llseek,
2126};
2127
2128static struct miscdevice mce_chrdev_device = {
2129 MISC_MCELOG_MINOR,
2130 "mcelog",
2131 &mce_chrdev_ops,
2132};
2133
2134static void __mce_disable_bank(void *arg) 1813static void __mce_disable_bank(void *arg)
2135{ 1814{
2136 int bank = *((int *)arg); 1815 int bank = *((int *)arg);
@@ -2349,29 +2028,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2349 return size; 2028 return size;
2350} 2029}
2351 2030
2352static ssize_t
2353show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2354{
2355 strcpy(buf, mce_helper);
2356 strcat(buf, "\n");
2357 return strlen(mce_helper) + 1;
2358}
2359
2360static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2361 const char *buf, size_t siz)
2362{
2363 char *p;
2364
2365 strncpy(mce_helper, buf, sizeof(mce_helper));
2366 mce_helper[sizeof(mce_helper)-1] = 0;
2367 p = strchr(mce_helper, '\n');
2368
2369 if (p)
2370 *p = 0;
2371
2372 return strlen(mce_helper) + !!p;
2373}
2374
2375static ssize_t set_ignore_ce(struct device *s, 2031static ssize_t set_ignore_ce(struct device *s,
2376 struct device_attribute *attr, 2032 struct device_attribute *attr,
2377 const char *buf, size_t size) 2033 const char *buf, size_t size)
@@ -2428,7 +2084,6 @@ static ssize_t store_int_with_restart(struct device *s,
2428 return ret; 2084 return ret;
2429} 2085}
2430 2086
2431static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2432static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); 2087static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2433static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); 2088static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2434static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); 2089static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2451,7 +2106,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
2451static struct device_attribute *mce_device_attrs[] = { 2106static struct device_attribute *mce_device_attrs[] = {
2452 &dev_attr_tolerant.attr, 2107 &dev_attr_tolerant.attr,
2453 &dev_attr_check_interval.attr, 2108 &dev_attr_check_interval.attr,
2109#ifdef CONFIG_X86_MCELOG_LEGACY
2454 &dev_attr_trigger, 2110 &dev_attr_trigger,
2111#endif
2455 &dev_attr_monarch_timeout.attr, 2112 &dev_attr_monarch_timeout.attr,
2456 &dev_attr_dont_log_ce.attr, 2113 &dev_attr_dont_log_ce.attr,
2457 &dev_attr_ignore_ce.attr, 2114 &dev_attr_ignore_ce.attr,
@@ -2625,7 +2282,6 @@ static __init void mce_init_banks(void)
2625 2282
2626static __init int mcheck_init_device(void) 2283static __init int mcheck_init_device(void)
2627{ 2284{
2628 enum cpuhp_state hp_online;
2629 int err; 2285 int err;
2630 2286
2631 if (!mce_available(&boot_cpu_data)) { 2287 if (!mce_available(&boot_cpu_data)) {
@@ -2653,21 +2309,11 @@ static __init int mcheck_init_device(void)
2653 mce_cpu_online, mce_cpu_pre_down); 2309 mce_cpu_online, mce_cpu_pre_down);
2654 if (err < 0) 2310 if (err < 0)
2655 goto err_out_online; 2311 goto err_out_online;
2656 hp_online = err;
2657 2312
2658 register_syscore_ops(&mce_syscore_ops); 2313 register_syscore_ops(&mce_syscore_ops);
2659 2314
2660 /* register character device /dev/mcelog */
2661 err = misc_register(&mce_chrdev_device);
2662 if (err)
2663 goto err_register;
2664
2665 return 0; 2315 return 0;
2666 2316
2667err_register:
2668 unregister_syscore_ops(&mce_syscore_ops);
2669 cpuhp_remove_state(hp_online);
2670
2671err_out_online: 2317err_out_online:
2672 cpuhp_remove_state(CPUHP_X86_MCE_DEAD); 2318 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2673 2319
@@ -2675,7 +2321,7 @@ err_out_mem:
2675 free_cpumask_var(mce_device_initialized); 2321 free_cpumask_var(mce_device_initialized);
2676 2322
2677err_out: 2323err_out:
2678 pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); 2324 pr_err("Unable to init MCE device (rc: %d)\n", err);
2679 2325
2680 return err; 2326 return err;
2681} 2327}