path: root/arch/x86/kernel/cpu/mcheck/mce.c
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	133
1 file changed, 133 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 13e1b7ffe73a..d4e7b5947a0e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
 
 #include <asm/processor.h>
 #include <asm/hw_irq.h>
@@ -105,6 +106,8 @@ static inline int skip_bank_init(int i)
 	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
 }
 
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v)
 	wrmsrl(msr, v);
 }
 
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler
+ * to the process context work function. This is vastly simplified
+ * because there's only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16	/* we use one entry less */
+
+struct mce_ring {
+	unsigned short start;
+	unsigned short end;
+	unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+	return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+	struct mce_ring *r;
+	int ret = 0;
+
+	*pfn = 0;
+	get_cpu();
+	r = &__get_cpu_var(mce_ring);
+	if (r->start == r->end)
+		goto out;
+	*pfn = r->ring[r->start];
+	r->start = (r->start + 1) % MCE_RING_SIZE;
+	ret = 1;
+out:
+	put_cpu();
+	return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+	unsigned next;
+
+	next = (r->end + 1) % MCE_RING_SIZE;
+	if (next == r->start)
+		return -1;
+	r->ring[r->end] = pfn;
+	wmb();
+	r->end = next;
+	return 0;
+}
+
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_disabled)
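
The ring above is a classic single-producer, single-consumer queue: one slot is intentionally left unused so that start == end unambiguously means empty, and the wmb() orders the PFN store before the index update so the reader never consumes an unwritten slot. A minimal userspace sketch of the same scheme, with all names illustrative and __sync_synchronize() standing in for the kernel's wmb():

#include <stdio.h>

#define RING_SIZE 16			/* one entry is always left unused */

struct pfn_ring {
	unsigned short start;		/* consumer index */
	unsigned short end;		/* producer index */
	unsigned long ring[RING_SIZE];
};

/* Producer: refuses to fill the last slot, so full != empty. */
static int ring_add(struct pfn_ring *r, unsigned long pfn)
{
	unsigned next = (r->end + 1) % RING_SIZE;

	if (next == r->start)		/* full */
		return -1;
	r->ring[r->end] = pfn;
	__sync_synchronize();		/* data visible before index moves */
	r->end = next;
	return 0;
}

/* Consumer: returns 1 and fills *pfn, or 0 when start meets end. */
static int ring_get(struct pfn_ring *r, unsigned long *pfn)
{
	if (r->start == r->end)
		return 0;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	struct pfn_ring r = { 0, 0, { 0 } };
	unsigned long pfn;

	ring_add(&r, 0x1234);
	ring_add(&r, 0x5678);
	while (ring_get(&r, &pfn))
		printf("pfn %#lx\n", pfn);
	return 0;
}
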
@@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+static void mce_schedule_work(void)
+{
+	if (!mce_ring_empty()) {
+		struct work_struct *work = &__get_cpu_var(mce_work);
+		if (!work_pending(work))
+			schedule_work(work);
+	}
+}
+
 /*
  * Get the address of the instruction at the time of the machine check
  * error.
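
mce_schedule_work() relies on work_pending() to avoid queuing the same per-CPU work item twice; the workqueue core clears the pending bit before invoking the callback, so a new machine check arriving during the drain can re-arm it. A toy single-threaded model of that at-most-once-queued behavior (all names hypothetical, not the kernel workqueue API):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of schedule_work()/work_pending(): a work item may be
 * queued at most once until it runs, mirroring mce_schedule_work(). */
struct work { bool pending; void (*fn)(void); };

static bool schedule_work_once(struct work *w)
{
	if (w->pending)			/* already queued: nothing to do */
		return false;
	w->pending = true;
	return true;
}

static void run_work(struct work *w)
{
	w->pending = false;		/* cleared before the callback runs */
	w->fn();
}

static void drain(void) { puts("draining mce ring"); }

int main(void)
{
	struct work w = { false, drain };

	printf("queued: %d\n", schedule_work_once(&w));	/* 1 */
	printf("queued: %d\n", schedule_work_once(&w));	/* 0: deduped */
	run_work(&w);
	printf("queued: %d\n", schedule_work_once(&w));	/* 1 again */
	return 0;
}
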
@@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
 	mce_notify_irq();
+	mce_schedule_work();
 	irq_exit();
 }
 #endif
@@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 		mce_notify_irq();
+		/*
+		 * Triggering the work queue here is just an insurance
+		 * policy in case the syscall exit notify handler
+		 * doesn't run soon enough or ends up running on the
+		 * wrong CPU (can happen when audit sleeps)
+		 */
+		mce_schedule_work();
 		return;
 	}
 
@@ -731,6 +806,23 @@ reset:
 	return ret;
 }
 
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+		return 0;
+	if ((m->misc & 0x3f) > PAGE_SHIFT)
+		return 0;
+	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+		return 0;
+	return 1;
+}
+
 static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
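
mce_usable_address() decodes the IA32_MCi_MISC layout: bits 5:0 give the position of the least significant valid address bit (a value above PAGE_SHIFT means the address is coarser than one page), and bits 8:6 give the address mode, which must be physical for page offlining to make sense. A hedged userspace decoder mirroring those checks, assuming MCM_ADDR_PHYS is 2 as in the kernel headers of this era:

#include <stdio.h>

#define PAGE_SHIFT	12	/* x86 base page size, 4 KiB */
#define MCM_ADDR_PHYS	2	/* "physical address" mode in MCi_MISC */

/* Mirrors the checks in mce_usable_address(): the low 6 bits of
 * MCi_MISC encode the least significant valid address bit, the next
 * 3 bits encode the address mode. */
static int misc_usable(unsigned long long misc)
{
	unsigned lsb  = misc & 0x3f;
	unsigned mode = (misc >> 6) & 7;

	return lsb <= PAGE_SHIFT && mode == MCM_ADDR_PHYS;
}

int main(void)
{
	/* Hypothetical MCi_MISC: mode = physical (2), lsb = 12 (page). */
	unsigned long long misc = (2ULL << 6) | 12;

	printf("usable: %d\n", misc_usable(misc));
	return 0;
}
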
@@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (m.status & MCI_STATUS_ADDRV)
 			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 
+		/*
+		 * Action optional error. Queue address for later processing.
+		 * When the ring overflows we just ignore the AO error.
+		 * RED-PEN add some logging mechanism when
+		 * mce_usable_address() or mce_ring_add() fails.
+		 * RED-PEN don't ignore overflow for tolerant == 0
+		 */
+		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+			mce_ring_add(m.addr >> PAGE_SHIFT);
+
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
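
The value queued is a page frame number rather than a raw address: shifting the physical address right by PAGE_SHIFT discards the offset within the 4 KiB page, which is all the granularity mce_usable_address() guarantees. A one-line illustration with a hypothetical address:

#include <stdio.h>

#define PAGE_SHIFT 12			/* x86 base pages are 4 KiB */

int main(void)
{
	unsigned long long addr = 0x12345678ULL;	/* hypothetical MCi_ADDR */
	unsigned long pfn = addr >> PAGE_SHIFT;		/* -> 0x12345 */

	printf("addr %#llx maps to pfn %#lx\n", addr, pfn);
	return 0;
}
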
@@ -916,6 +1018,36 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
+/* Dummy to break dependency. Actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+	unsigned long pfn;
+	mce_notify_irq();
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+	mce_notify_process();
+}
+
 #ifdef CONFIG_X86_MCE_INTEL
 /***
  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
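
The weak memory_failure() stub is what lets mce.c link even when mm/memory-failure.c is not built: a strong definition, when present, wins at link time. A small standalone illustration of that linker behavior (0x12 is MCE_VECTOR, the x86 machine check exception vector):

#include <stdio.h>

/* Weak default: chosen only if no other object defines the symbol.
 * Linking a file with a plain (strong) memory_failure() overrides it. */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printf("weak stub: pfn %#lx ignored (vector %d)\n", pfn, vector);
}

int main(void)
{
	memory_failure(0x42, 0x12);
	return 0;
}
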
@@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 	mce_init();
 	mce_cpu_features(c);
 	mce_init_timer();
+	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 }
 
 /*