aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
author: Tim Hockin <thockin@google.com> 2007-07-21 11:10:36 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-21 21:37:10 -0400
commit: e02e68d31e2d436197386997727b216fee9c4623 (patch)
tree: 97160f82b8deaf6e83d988844d5b410baa4e3ab4 /arch/x86_64
parent: f528e7ba28492e363a64c80c414ded4cadf48f89 (diff)
x86_64: support poll() on /dev/mcelog
Background:
  /dev/mcelog is typically polled manually. This is less than optimal for situations where accurate accounting of MCEs is important. Calling poll() on /dev/mcelog does not work.

Description:
  This patch adds support for poll() to /dev/mcelog. This results in immediate wakeup of user apps whenever the poller finds MCEs. Because the exception handler cannot take any locks, it cannot call the wakeup itself. Instead, it uses a thread_info flag (TIF_MCE_NOTIFY) which is caught at the next return from interrupt or exit from idle, calling the mce_notify_user() routine. This patch also disables the "fake panic" path of mce_panic(), because it results in printk()s in the exception handler and crashy systems.

  This patch also does some small cleanup for essentially unused variables, and moves the user notification into the body of the poller, so it is only called once per poll, rather than once per CPU.

Result:
  Applications can now poll() on /dev/mcelog. When an error is logged (whether through the poller or through an exception) the applications are woken up promptly. This should not affect any previous behaviors. If no MCEs are being logged, there is no overhead.

Alternatives:
  I considered simply supporting poll() through the poller and not using TIF_MCE_NOTIFY at all. However, the time between an uncorrectable error happening and the user application being notified is *the most* critical window for us. Many uncorrectable errors can be logged to the network if given a chance.

  I also considered doing the MCE poll directly from the idle notifier, but decided that was overkill.

Testing:
  I used an error-injecting DIMM to create lots of correctable DRAM errors and verified that my user app is woken up in sync with the polling interval. I also used the northbridge to inject uncorrectable ECC errors, and verified (printk() to the rescue) that the notify routine is called and the user app does wake up. I built with PREEMPT on and off, and verified that my machine survives MCEs.

[wli@holomorphy.com: build fix]
Signed-off-by: Tim Hockin <thockin@google.com>
Signed-off-by: William Irwin <bill.irwin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/kernel/entry.S    6
-rw-r--r--  arch/x86_64/kernel/mce.c    105
-rw-r--r--  arch/x86_64/kernel/signal.c   7
3 files changed, 78 insertions, 40 deletions
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index a67f87bf401..830cfc6ee8c 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -282,7 +282,7 @@ sysret_careful:
282sysret_signal: 282sysret_signal:
283 TRACE_IRQS_ON 283 TRACE_IRQS_ON
284 sti 284 sti
285 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx 285 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
286 jz 1f 286 jz 1f
287 287
288 /* Really a signal */ 288 /* Really a signal */
@@ -375,7 +375,7 @@ int_very_careful:
375 jmp int_restore_rest 375 jmp int_restore_rest
376 376
377int_signal: 377int_signal:
378 testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx 378 testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
379 jz 1f 379 jz 1f
380 movq %rsp,%rdi # &ptregs -> arg1 380 movq %rsp,%rdi # &ptregs -> arg1
381 xorl %esi,%esi # oldset -> arg2 381 xorl %esi,%esi # oldset -> arg2
@@ -599,7 +599,7 @@ retint_careful:
599 jmp retint_check 599 jmp retint_check
600 600
601retint_signal: 601retint_signal:
602 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx 602 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
603 jz retint_swapgs 603 jz retint_swapgs
604 TRACE_IRQS_ON 604 TRACE_IRQS_ON
605 sti 605 sti
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 77fee481be4..968613572b9 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -18,6 +18,8 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/poll.h>
22#include <linux/thread_info.h>
21#include <linux/ctype.h> 23#include <linux/ctype.h>
22#include <linux/kmod.h> 24#include <linux/kmod.h>
23#include <linux/kdebug.h> 25#include <linux/kdebug.h>
@@ -26,6 +28,7 @@
26#include <asm/mce.h> 28#include <asm/mce.h>
27#include <asm/uaccess.h> 29#include <asm/uaccess.h>
28#include <asm/smp.h> 30#include <asm/smp.h>
31#include <asm/idle.h>
29 32
30#define MISC_MCELOG_MINOR 227 33#define MISC_MCELOG_MINOR 227
31#define NR_BANKS 6 34#define NR_BANKS 6
@@ -39,8 +42,7 @@ static int mce_dont_init;
39static int tolerant = 1; 42static int tolerant = 1;
40static int banks; 43static int banks;
41static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; 44static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
42static unsigned long console_logged; 45static unsigned long notify_user;
43static int notify_user;
44static int rip_msr; 46static int rip_msr;
45static int mce_bootlog = 1; 47static int mce_bootlog = 1;
46static atomic_t mce_events; 48static atomic_t mce_events;
@@ -48,6 +50,8 @@ static atomic_t mce_events;
48static char trigger[128]; 50static char trigger[128];
49static char *trigger_argv[2] = { trigger, NULL }; 51static char *trigger_argv[2] = { trigger, NULL };
50 52
53static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
54
51/* 55/*
52 * Lockless MCE logging infrastructure. 56 * Lockless MCE logging infrastructure.
53 * This avoids deadlocks on printk locks without having to break locks. Also 57 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -94,8 +98,7 @@ void mce_log(struct mce *mce)
94 mcelog.entry[entry].finished = 1; 98 mcelog.entry[entry].finished = 1;
95 wmb(); 99 wmb();
96 100
97 if (!test_and_set_bit(0, &console_logged)) 101 set_bit(0, &notify_user);
98 notify_user = 1;
99} 102}
100 103
101static void print_mce(struct mce *m) 104static void print_mce(struct mce *m)
@@ -128,6 +131,10 @@ static void print_mce(struct mce *m)
128static void mce_panic(char *msg, struct mce *backup, unsigned long start) 131static void mce_panic(char *msg, struct mce *backup, unsigned long start)
129{ 132{
130 int i; 133 int i;
134
135 if (tolerant >= 3)
136 return;
137
131 oops_begin(); 138 oops_begin();
132 for (i = 0; i < MCE_LOG_LEN; i++) { 139 for (i = 0; i < MCE_LOG_LEN; i++) {
133 unsigned long tsc = mcelog.entry[i].tsc; 140 unsigned long tsc = mcelog.entry[i].tsc;
@@ -139,10 +146,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
139 } 146 }
140 if (backup) 147 if (backup)
141 print_mce(backup); 148 print_mce(backup);
142 if (tolerant >= 3) 149 panic(msg);
143 printk("Fake panic: %s\n", msg);
144 else
145 panic(msg);
146} 150}
147 151
148static int mce_available(struct cpuinfo_x86 *c) 152static int mce_available(struct cpuinfo_x86 *c)
@@ -167,17 +171,6 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
167 } 171 }
168} 172}
169 173
170static void do_mce_trigger(void)
171{
172 static atomic_t mce_logged;
173 int events = atomic_read(&mce_events);
174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events);
177 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
178 }
179}
180
181/* 174/*
182 * The actual machine check handler 175 * The actual machine check handler
183 */ 176 */
@@ -251,12 +244,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
251 } 244 }
252 245
253 /* Never do anything final in the polling timer */ 246 /* Never do anything final in the polling timer */
254 if (!regs) { 247 if (!regs)
255 /* Normal interrupt context here. Call trigger for any new
256 events. */
257 do_mce_trigger();
258 goto out; 248 goto out;
259 }
260 249
261 /* If we didn't find an uncorrectable error, pick 250 /* If we didn't find an uncorrectable error, pick
262 the last one (shouldn't happen, just being safe). */ 251 the last one (shouldn't happen, just being safe). */
@@ -288,6 +277,9 @@ void do_machine_check(struct pt_regs * regs, long error_code)
288 do_exit(SIGBUS); 277 do_exit(SIGBUS);
289 } 278 }
290 279
280 /* notify userspace ASAP */
281 set_thread_flag(TIF_MCE_NOTIFY);
282
291 out: 283 out:
292 /* Last thing done in the machine check exception to clear state. */ 284 /* Last thing done in the machine check exception to clear state. */
293 wrmsrl(MSR_IA32_MCG_STATUS, 0); 285 wrmsrl(MSR_IA32_MCG_STATUS, 0);
@@ -344,37 +336,67 @@ static void mcheck_timer(struct work_struct *work)
344 on_each_cpu(mcheck_check_cpu, NULL, 1, 1); 336 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
345 337
346 /* 338 /*
347 * It's ok to read stale data here for notify_user and 339 * Alert userspace if needed. If we logged an MCE, reduce the
348 * console_logged as we'll simply get the updated versions 340 * polling interval, otherwise increase the polling interval.
349 * on the next mcheck_timer execution and atomic operations
350 * on console_logged act as synchronization for notify_user
351 * writes.
352 */ 341 */
353 if (notify_user && console_logged) { 342 if (mce_notify_user()) {
343 next_interval = max(next_interval/2, HZ/100);
344 } else {
345 next_interval = min(next_interval*2, check_interval*HZ);
346 }
347
348 schedule_delayed_work(&mcheck_work, next_interval);
349}
350
351/*
352 * This is only called from process context. This is where we do
353 * anything we need to alert userspace about new MCEs. This is called
354 * directly from the poller and also from entry.S and idle, thanks to
355 * TIF_MCE_NOTIFY.
356 */
357int mce_notify_user(void)
358{
359 clear_thread_flag(TIF_MCE_NOTIFY);
360 if (test_and_clear_bit(0, &notify_user)) {
354 static unsigned long last_print; 361 static unsigned long last_print;
355 unsigned long now = jiffies; 362 unsigned long now = jiffies;
356 363
357 /* if we logged an MCE, reduce the polling interval */ 364 wake_up_interruptible(&mce_wait);
358 next_interval = max(next_interval/2, HZ/100); 365 if (trigger[0])
359 notify_user = 0; 366 call_usermodehelper(trigger, trigger_argv, NULL,
360 clear_bit(0, &console_logged); 367 UMH_NO_WAIT);
368
361 if (time_after_eq(now, last_print + (check_interval*HZ))) { 369 if (time_after_eq(now, last_print + (check_interval*HZ))) {
362 last_print = now; 370 last_print = now;
363 printk(KERN_INFO "Machine check events logged\n"); 371 printk(KERN_INFO "Machine check events logged\n");
364 } 372 }
365 } else { 373
366 next_interval = min(next_interval*2, check_interval*HZ); 374 return 1;
367 } 375 }
376 return 0;
377}
368 378
369 schedule_delayed_work(&mcheck_work, next_interval); 379/* see if the idle task needs to notify userspace */
380static int
381mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
382{
383 /* IDLE_END should be safe - interrupts are back on */
384 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
385 mce_notify_user();
386
387 return NOTIFY_OK;
370} 388}
371 389
390static struct notifier_block mce_idle_notifier = {
391 .notifier_call = mce_idle_callback,
392};
372 393
373static __init int periodic_mcheck_init(void) 394static __init int periodic_mcheck_init(void)
374{ 395{
375 next_interval = check_interval * HZ; 396 next_interval = check_interval * HZ;
376 if (next_interval) 397 if (next_interval)
377 schedule_delayed_work(&mcheck_work, next_interval); 398 schedule_delayed_work(&mcheck_work, next_interval);
399 idle_notifier_register(&mce_idle_notifier);
378 return 0; 400 return 0;
379} 401}
380__initcall(periodic_mcheck_init); 402__initcall(periodic_mcheck_init);
@@ -566,6 +588,14 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
566 return err ? -EFAULT : buf - ubuf; 588 return err ? -EFAULT : buf - ubuf;
567} 589}
568 590
591static unsigned int mce_poll(struct file *file, poll_table *wait)
592{
593 poll_wait(file, &mce_wait, wait);
594 if (rcu_dereference(mcelog.next))
595 return POLLIN | POLLRDNORM;
596 return 0;
597}
598
569static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) 599static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
570{ 600{
571 int __user *p = (int __user *)arg; 601 int __user *p = (int __user *)arg;
@@ -592,6 +622,7 @@ static const struct file_operations mce_chrdev_ops = {
592 .open = mce_open, 622 .open = mce_open,
593 .release = mce_release, 623 .release = mce_release,
594 .read = mce_read, 624 .read = mce_read,
625 .poll = mce_poll,
595 .ioctl = mce_ioctl, 626 .ioctl = mce_ioctl,
596}; 627};
597 628
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 290f5d8037c..4886afcd628 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -26,6 +26,7 @@
26#include <asm/i387.h> 26#include <asm/i387.h>
27#include <asm/proto.h> 27#include <asm/proto.h>
28#include <asm/ia32_unistd.h> 28#include <asm/ia32_unistd.h>
29#include <asm/mce.h>
29 30
30/* #define DEBUG_SIG 1 */ 31/* #define DEBUG_SIG 1 */
31 32
@@ -472,6 +473,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
472 clear_thread_flag(TIF_SINGLESTEP); 473 clear_thread_flag(TIF_SINGLESTEP);
473 } 474 }
474 475
476#ifdef CONFIG_X86_MCE
477 /* notify userspace of pending MCEs */
478 if (thread_info_flags & _TIF_MCE_NOTIFY)
479 mce_notify_user();
480#endif /* CONFIG_X86_MCE */
481
475 /* deal with pending signal delivery */ 482 /* deal with pending signal delivery */
476 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) 483 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
477 do_signal(regs); 484 do_signal(regs);