diff options
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 13e1b7ffe73a..d4e7b5947a0e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
35 | #include <linux/fs.h> | 35 | #include <linux/fs.h> |
36 | #include <linux/mm.h> | ||
36 | 37 | ||
37 | #include <asm/processor.h> | 38 | #include <asm/processor.h> |
38 | #include <asm/hw_irq.h> | 39 | #include <asm/hw_irq.h> |
@@ -105,6 +106,8 @@ static inline int skip_bank_init(int i) | |||
105 | return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); | 106 | return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); |
106 | } | 107 | } |
107 | 108 | ||
109 | static DEFINE_PER_CPU(struct work_struct, mce_work); | ||
110 | |||
108 | /* Do initial initialization of a struct mce */ | 111 | /* Do initial initialization of a struct mce */ |
109 | void mce_setup(struct mce *m) | 112 | void mce_setup(struct mce *m) |
110 | { | 113 | { |
@@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v) | |||
312 | wrmsrl(msr, v); | 315 | wrmsrl(msr, v); |
313 | } | 316 | } |
314 | 317 | ||
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;	/* consumer index: next entry to read */
	unsigned short end;	/* producer index: next free slot */
	unsigned long ring[MCE_RING_SIZE];	/* queued page frame numbers */
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);
331 | |||
332 | /* Runs with CPU affinity in workqueue */ | ||
333 | static int mce_ring_empty(void) | ||
334 | { | ||
335 | struct mce_ring *r = &__get_cpu_var(mce_ring); | ||
336 | |||
337 | return r->start == r->end; | ||
338 | } | ||
339 | |||
340 | static int mce_ring_get(unsigned long *pfn) | ||
341 | { | ||
342 | struct mce_ring *r; | ||
343 | int ret = 0; | ||
344 | |||
345 | *pfn = 0; | ||
346 | get_cpu(); | ||
347 | r = &__get_cpu_var(mce_ring); | ||
348 | if (r->start == r->end) | ||
349 | goto out; | ||
350 | *pfn = r->ring[r->start]; | ||
351 | r->start = (r->start + 1) % MCE_RING_SIZE; | ||
352 | ret = 1; | ||
353 | out: | ||
354 | put_cpu(); | ||
355 | return ret; | ||
356 | } | ||
357 | |||
358 | /* Always runs in MCE context with preempt off */ | ||
359 | static int mce_ring_add(unsigned long pfn) | ||
360 | { | ||
361 | struct mce_ring *r = &__get_cpu_var(mce_ring); | ||
362 | unsigned next; | ||
363 | |||
364 | next = (r->end + 1) % MCE_RING_SIZE; | ||
365 | if (next == r->start) | ||
366 | return -1; | ||
367 | r->ring[r->end] = pfn; | ||
368 | wmb(); | ||
369 | r->end = next; | ||
370 | return 0; | ||
371 | } | ||
372 | |||
315 | int mce_available(struct cpuinfo_x86 *c) | 373 | int mce_available(struct cpuinfo_x86 *c) |
316 | { | 374 | { |
317 | if (mce_disabled) | 375 | if (mce_disabled) |
@@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c) | |||
319 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | 377 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); |
320 | } | 378 | } |
321 | 379 | ||
380 | static void mce_schedule_work(void) | ||
381 | { | ||
382 | if (!mce_ring_empty()) { | ||
383 | struct work_struct *work = &__get_cpu_var(mce_work); | ||
384 | if (!work_pending(work)) | ||
385 | schedule_work(work); | ||
386 | } | ||
387 | } | ||
388 | |||
322 | /* | 389 | /* |
323 | * Get the address of the instruction at the time of the machine check | 390 | * Get the address of the instruction at the time of the machine check |
324 | * error. | 391 | * error. |
@@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) | |||
349 | exit_idle(); | 416 | exit_idle(); |
350 | irq_enter(); | 417 | irq_enter(); |
351 | mce_notify_irq(); | 418 | mce_notify_irq(); |
419 | mce_schedule_work(); | ||
352 | irq_exit(); | 420 | irq_exit(); |
353 | } | 421 | } |
354 | #endif | 422 | #endif |
@@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs) | |||
357 | { | 425 | { |
358 | if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { | 426 | if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { |
359 | mce_notify_irq(); | 427 | mce_notify_irq(); |
428 | /* | ||
429 | * Triggering the work queue here is just an insurance | ||
430 | * policy in case the syscall exit notify handler | ||
431 | * doesn't run soon enough or ends up running on the | ||
432 | * wrong CPU (can happen when audit sleeps) | ||
433 | */ | ||
434 | mce_schedule_work(); | ||
360 | return; | 435 | return; |
361 | } | 436 | } |
362 | 437 | ||
@@ -731,6 +806,23 @@ reset: | |||
731 | return ret; | 806 | return ret; |
732 | } | 807 | } |
733 | 808 | ||
809 | /* | ||
810 | * Check if the address reported by the CPU is in a format we can parse. | ||
811 | * It would be possible to add code for most other cases, but all would | ||
812 | * be somewhat complicated (e.g. segment offset would require an instruction | ||
813 | * parser). So only support physical addresses upto page granuality for now. | ||
814 | */ | ||
815 | static int mce_usable_address(struct mce *m) | ||
816 | { | ||
817 | if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) | ||
818 | return 0; | ||
819 | if ((m->misc & 0x3f) > PAGE_SHIFT) | ||
820 | return 0; | ||
821 | if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) | ||
822 | return 0; | ||
823 | return 1; | ||
824 | } | ||
825 | |||
734 | static void mce_clear_state(unsigned long *toclear) | 826 | static void mce_clear_state(unsigned long *toclear) |
735 | { | 827 | { |
736 | int i; | 828 | int i; |
@@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
865 | if (m.status & MCI_STATUS_ADDRV) | 957 | if (m.status & MCI_STATUS_ADDRV) |
866 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | 958 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); |
867 | 959 | ||
960 | /* | ||
961 | * Action optional error. Queue address for later processing. | ||
962 | * When the ring overflows we just ignore the AO error. | ||
963 | * RED-PEN add some logging mechanism when | ||
964 | * usable_address or mce_add_ring fails. | ||
965 | * RED-PEN don't ignore overflow for tolerant == 0 | ||
966 | */ | ||
967 | if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) | ||
968 | mce_ring_add(m.addr >> PAGE_SHIFT); | ||
969 | |||
868 | mce_get_rip(&m, regs); | 970 | mce_get_rip(&m, regs); |
869 | mce_log(&m); | 971 | mce_log(&m); |
870 | 972 | ||
@@ -916,6 +1018,36 @@ out: | |||
916 | } | 1018 | } |
917 | EXPORT_SYMBOL_GPL(do_machine_check); | 1019 | EXPORT_SYMBOL_GPL(do_machine_check); |
918 | 1020 | ||
/*
 * Dummy to break the build dependency on the memory poisoning code.
 * The actual implementation is in mm/memory-failure.c; being a strong
 * symbol, it overrides this weak stub when that code is compiled in.
 */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}
1026 | |||
/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	/* Drain every PFN queued by the exception handler */
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}
1045 | |||
/* Per-CPU work function: thin workqueue adapter around mce_notify_process() */
static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}
1050 | |||
919 | #ifdef CONFIG_X86_MCE_INTEL | 1051 | #ifdef CONFIG_X86_MCE_INTEL |
920 | /*** | 1052 | /*** |
921 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | 1053 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog |
@@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | |||
1204 | mce_init(); | 1336 | mce_init(); |
1205 | mce_cpu_features(c); | 1337 | mce_cpu_features(c); |
1206 | mce_init_timer(); | 1338 | mce_init_timer(); |
1339 | INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); | ||
1207 | } | 1340 | } |
1208 | 1341 | ||
1209 | /* | 1342 | /* |