aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <andi@firstfloor.org>2009-05-27 15:56:59 -0400
committerH. Peter Anvin <hpa@zytor.com>2009-06-03 17:48:59 -0400
commit9b1beaf2b551a8a1604f104025b24e9c535c8963 (patch)
treeb335ca7e4744c6de875c6421a6131539094ae851
parent8fa8dd9e3aafb7b440b7d54219891615abc6390e (diff)
x86, mce: support action-optional machine checks
Newer Intel CPUs support a new class of machine checks called recoverable action optional. Action Optional means that the CPU detected some form of corruption in the background and tells the OS about it using a machine check exception. The OS can then take appropriate action, like killing the process with the corrupted data or logging the event properly to disk. This is done by the new generic high level memory failure handler added in an earlier patch. The high level handler takes the address with the failed memory and does the appropriate action, like killing the process. In this version of the patch the high level handler is stubbed out with a weak function to not create a direct dependency on the hwpoison branch. The high level handler cannot be directly called from the machine check exception though, because it has to run in a defined process context to be able to sleep when taking VM locks (it is not expected to sleep for a long time, just do so in some exceptional cases like lock contention). Thus the MCE handler has to queue a work item for process context, trigger process context and then call the high level handler from there. This patch adds two paths to process context: through a per thread kernel exit notify_user() callback or through a high priority work item. The first runs when the process exits back to user space, the other when it goes to sleep and there is no higher priority process. The machine check handler will schedule both, and whoever runs first will grab the event. This is done because quick reaction to this event is critical to avoid a potentially more fatal machine check when the corruption is consumed. There is a simple lockless ring buffer to queue the corrupted addresses between the exception handler and the process context handler. Then in process context it just calls the high level VM code with the corrupted PFNs. The patch adds the required code to extract the failed address from the CPU's machine check registers.
It doesn't try to handle all possible cases -- the specification has 6 different ways to specify a memory address -- but only the physical address. Most of the required checking has already been done earlier in the mce_severity rule checking engine. Following the Intel recommendations, Action Optional errors are only enabled for known situations (encoded in MCACODs). The errors are ignored otherwise, because they are action optional. v2: Improve comment, disable preemption while processing ring buffer (reported by Ying Huang) Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-rw-r--r--arch/x86/include/asm/mce.h1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c133
-rw-r--r--arch/x86/kernel/signal.c2
3 files changed, 135 insertions, 1 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 713926b62cbb..82978ad12072 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -160,6 +160,7 @@ enum mcp_flags {
160void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 160void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
161 161
162int mce_notify_irq(void); 162int mce_notify_irq(void);
163void mce_notify_process(void);
163 164
164DECLARE_PER_CPU(struct mce, injectm); 165DECLARE_PER_CPU(struct mce, injectm);
165extern struct file_operations mce_chrdev_ops; 166extern struct file_operations mce_chrdev_ops;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 13e1b7ffe73a..d4e7b5947a0e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -33,6 +33,7 @@
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/mm.h>
36 37
37#include <asm/processor.h> 38#include <asm/processor.h>
38#include <asm/hw_irq.h> 39#include <asm/hw_irq.h>
@@ -105,6 +106,8 @@ static inline int skip_bank_init(int i)
105 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); 106 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
106} 107}
107 108
109static DEFINE_PER_CPU(struct work_struct, mce_work);
110
108/* Do initial initialization of a struct mce */ 111/* Do initial initialization of a struct mce */
109void mce_setup(struct mce *m) 112void mce_setup(struct mce *m)
110{ 113{
@@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v)
312 wrmsrl(msr, v); 315 wrmsrl(msr, v);
313} 316}
314 317
318/*
319 * Simple lockless ring to communicate PFNs from the exception handler with the
320 * process context work function. This is vastly simplified because there's
321 * only a single reader and a single writer.
 *
 * The ring is empty when start == end. The writer refuses to advance end
 * onto start, so at most MCE_RING_SIZE - 1 entries are queued at any time.
322 */
323#define MCE_RING_SIZE 16 /* we use one entry less */
324
325struct mce_ring {
 /* Index of the next entry to read; advanced by mce_ring_get(). */
326 unsigned short start;
 /* Index of the next slot to write; advanced by mce_ring_add(). */
327 unsigned short end;
 /* Queued page frame numbers of the corrupted pages. */
328 unsigned long ring[MCE_RING_SIZE];
329};
/* Each CPU gets its own ring instance. */
330static DEFINE_PER_CPU(struct mce_ring, mce_ring);
331
332/*
 * Return nonzero when the current CPU's ring holds no queued PFNs
 * (start and end indices are equal), zero otherwise.
 *
 * Runs with CPU affinity in workqueue
 */
333static int mce_ring_empty(void)
334{
335 struct mce_ring *r = &__get_cpu_var(mce_ring);
336
337 return r->start == r->end;
338}
339
/*
 * Remove the oldest queued PFN from the current CPU's ring.
 *
 * @pfn: output; always written. Set to 0 first, then overwritten with the
 *       dequeued PFN when an entry is available.
 *
 * Returns 1 when an entry was dequeued, 0 when the ring was empty.
 *
 * The ring access is bracketed by get_cpu()/put_cpu() so that preemption
 * is disabled while the per-CPU ring is being processed.
 */
340static int mce_ring_get(unsigned long *pfn)
341{
342 struct mce_ring *r;
343 int ret = 0;
344
345 *pfn = 0;
346 get_cpu();
347 r = &__get_cpu_var(mce_ring);
 /* Nothing queued: leave *pfn at 0 and report failure. */
348 if (r->start == r->end)
349 goto out;
350 *pfn = r->ring[r->start];
 /* Consume the entry; the index wraps around at MCE_RING_SIZE. */
351 r->start = (r->start + 1) % MCE_RING_SIZE;
352 ret = 1;
353out:
354 put_cpu();
355 return ret;
356}
357
358/*
 * Append a PFN to the current CPU's ring.
 *
 * @pfn: page frame number to queue.
 *
 * Returns 0 on success, or -1 when the ring is full (advancing end would
 * make it equal to start); in that case the PFN is not stored.
 *
 * Always runs in MCE context with preempt off
 */
359static int mce_ring_add(unsigned long pfn)
360{
361 struct mce_ring *r = &__get_cpu_var(mce_ring);
362 unsigned next;
363
364 next = (r->end + 1) % MCE_RING_SIZE;
 /* One slot is always left unused so that full and empty stay distinct. */
365 if (next == r->start)
366 return -1;
367 r->ring[r->end] = pfn;
 /* Order the entry store before the store that publishes the new end. */
368 wmb();
369 r->end = next;
370 return 0;
371}
372
315int mce_available(struct cpuinfo_x86 *c) 373int mce_available(struct cpuinfo_x86 *c)
316{ 374{
317 if (mce_disabled) 375 if (mce_disabled)
@@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c)
319 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 377 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
320} 378}
321 379
/*
 * Schedule the current CPU's mce_work item if this CPU's ring has queued
 * PFNs. Does nothing when the ring is empty or when the work item is
 * already pending.
 */
380static void mce_schedule_work(void)
381{
382 if (!mce_ring_empty()) {
383 struct work_struct *work = &__get_cpu_var(mce_work);
 /* Skip the schedule call when the item is already queued. */
384 if (!work_pending(work))
385 schedule_work(work);
386 }
387}
388
322/* 389/*
323 * Get the address of the instruction at the time of the machine check 390 * Get the address of the instruction at the time of the machine check
324 * error. 391 * error.
@@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
349 exit_idle(); 416 exit_idle();
350 irq_enter(); 417 irq_enter();
351 mce_notify_irq(); 418 mce_notify_irq();
419 mce_schedule_work();
352 irq_exit(); 420 irq_exit();
353} 421}
354#endif 422#endif
@@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs)
357{ 425{
358 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { 426 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
359 mce_notify_irq(); 427 mce_notify_irq();
428 /*
429 * Triggering the work queue here is just an insurance
430 * policy in case the syscall exit notify handler
431 * doesn't run soon enough or ends up running on the
432 * wrong CPU (can happen when audit sleeps)
433 */
434 mce_schedule_work();
360 return; 435 return;
361 } 436 }
362 437
@@ -731,6 +806,23 @@ reset:
731 return ret; 806 return ret;
732} 807}
733 808
809/*
810 * Check if the address reported by the CPU is in a format we can parse.
811 * It would be possible to add code for most other cases, but all would
812 * be somewhat complicated (e.g. segment offset would require an instruction
813 * parser). So only support physical addresses up to page granularity for now.
 *
 * @m: machine check record whose status and misc fields are examined.
 *
 * Returns 1 when all three checks below pass, 0 otherwise.
814 */
815static int mce_usable_address(struct mce *m)
816{
 /* Both the MISCV and ADDRV bits must be set in the status word. */
817 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
818 return 0;
 /*
  * Low six bits of misc must not exceed PAGE_SHIFT.
  * NOTE(review): presumably the least significant valid address bit
  * per the IA32_MCi_MISC layout -- confirm against the Intel SDM.
  */
819 if ((m->misc & 0x3f) > PAGE_SHIFT)
820 return 0;
 /*
  * Bits 6-8 of misc must equal MCM_ADDR_PHYS.
  * NOTE(review): presumably the address mode field -- confirm.
  */
821 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
822 return 0;
823 return 1;
824}
825
734static void mce_clear_state(unsigned long *toclear) 826static void mce_clear_state(unsigned long *toclear)
735{ 827{
736 int i; 828 int i;
@@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
865 if (m.status & MCI_STATUS_ADDRV) 957 if (m.status & MCI_STATUS_ADDRV)
866 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 958 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
867 959
960 /*
961 * Action optional error. Queue address for later processing.
962 * When the ring overflows we just ignore the AO error.
963 * RED-PEN add some logging mechanism when
964 * usable_address or mce_add_ring fails.
965 * RED-PEN don't ignore overflow for tolerant == 0
966 */
967 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
968 mce_ring_add(m.addr >> PAGE_SHIFT);
969
868 mce_get_rip(&m, regs); 970 mce_get_rip(&m, regs);
869 mce_log(&m); 971 mce_log(&m);
870 972
@@ -916,6 +1018,36 @@ out:
916} 1018}
917EXPORT_SYMBOL_GPL(do_machine_check); 1019EXPORT_SYMBOL_GPL(do_machine_check);
918 1020
1021/*
 * dummy to break dependency. actual code is in mm/memory-failure.c
 *
 * Declared weak; this fallback only logs the PFN at KERN_ERR level and
 * takes no other action.
 *
 * @pfn:    page frame number of the failed page (printed in hex).
 * @vector: not used by this stub.
 */
1022void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1023{
1024 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1025}
1026
1027/*
1028 * Called after mce notification in process context. This code
1029 * is allowed to sleep. Call the high level VM handler to process
1030 * any corrupted pages.
1031 * Assume that the work queue code only calls this one at a time
1032 * per CPU.
1033 * Note we don't disable preemption, so this code might run on the wrong
1034 * CPU. In this case the event is picked up by the scheduled work queue.
1035 * This is merely a fast path to expedite processing in some common
1036 * cases.
 *
 * Calls mce_notify_irq() first, then hands every PFN dequeued from the
 * ring to memory_failure() with MCE_VECTOR until the ring reports empty.
1037 */
1038void mce_notify_process(void)
1039{
1040 unsigned long pfn;
1041 mce_notify_irq();
 /* Drain the ring; mce_ring_get() returns 0 once it is empty. */
1042 while (mce_ring_get(&pfn))
1043 memory_failure(pfn, MCE_VECTOR);
1044}
1045
/*
 * Work item callback for the per-CPU mce_work item (installed with
 * INIT_WORK() in mcheck_init()). Simply forwards to mce_notify_process().
 *
 * @dummy: the work item being run; not used.
 */
1046static void mce_process_work(struct work_struct *dummy)
1047{
1048 mce_notify_process();
1049}
1050
919#ifdef CONFIG_X86_MCE_INTEL 1051#ifdef CONFIG_X86_MCE_INTEL
920/*** 1052/***
921 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog 1053 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
@@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1204 mce_init(); 1336 mce_init();
1205 mce_cpu_features(c); 1337 mce_cpu_features(c);
1206 mce_init_timer(); 1338 mce_init_timer();
1339 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1207} 1340}
1208 1341
1209/* 1342/*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d5dc15bce005..4976888094f0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
860#ifdef CONFIG_X86_NEW_MCE 860#ifdef CONFIG_X86_NEW_MCE
861 /* notify userspace of pending MCEs */ 861 /* notify userspace of pending MCEs */
862 if (thread_info_flags & _TIF_MCE_NOTIFY) 862 if (thread_info_flags & _TIF_MCE_NOTIFY)
863 mce_notify_irq(); 863 mce_notify_process();
864#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 864#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
865 865
866 /* deal with pending signal delivery */ 866 /* deal with pending signal delivery */