From 4efc0670baf4b14bc95502e54a83ccf639146125 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 19:07:31 +0200 Subject: x86, mce: use 64bit machine check code on 32bit The 64bit machine check code is in many ways much better than the 32bit machine check code: it is more specification compliant, is cleaner, only has a single code base versus one per CPU, has better infrastructure for recovery, has a cleaner way to communicate with user space etc. etc. Use the 64bit code for 32bit too. This is the second attempt to do this. There was one a couple of years ago to unify this code for 32bit and 64bit. Back then this ran into some trouble with K7s and was reverted. I believe this time the K7 problems (and some others) are addressed. I went over the old handlers and was very careful to retain all quirks. But of course this needs a lot of testing on old systems. On newer 64bit capable systems I don't expect much problems because they have been already tested with the 64bit kernel. I made this a CONFIG for now that still allows to select the old machine check code. This is mostly to make testing easier, if someone runs into a problem we can ask them to try with the CONFIG switched. The new code is default y for more coverage. Once there is confidence the 64bit code works well on older hardware too the CONFIG_X86_OLD_MCE and the associated code can be easily removed. This causes a behaviour change for 32bit installations. They now have to install the mcelog package to be able to log corrected machine checks. The 64bit machine check code only handles CPUs which support the standard Intel machine check architecture described in the IA32 SDM. The 32bit code has special support for some older CPUs which have non standard machine check architectures, in particular WinChip C3 and Intel P5. I made those a separate CONFIG option and kept them for now. The WinChip variant could be probably removed without too much pain, it doesn't really do anything interesting. P5 is also disabled by default (like it was before) because many motherboards have it miswired, but according to Alan Cox a few embedded setups use that one. Forward ported/heavily changed version of old patch, original patch included review/fixes from Thomas Gleixner, Bert Wesarg. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/irq.c') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c8..35eddc9ec99e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -89,7 +89,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); @@ -176,7 +176,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #endif #ifdef CONFIG_X86_MCE sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif #endif -- cgit v1.2.2 From 01ca79f1411eae2a45352709c838b946b1af9fbd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: x86, mce: add machine check exception count in /proc/interrupts Useful for debugging, but it's also good general policy to have a counter for all special interrupts there. This makes it easier to diagnose where a CPU is spending its time. [ Impact: feature, debugging tool ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel/irq.c') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index a05660bf0299..05fc635c28c0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; @@ -93,6 +94,12 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif +#endif +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -161,6 +168,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + sum += per_cpu(mce_exception_count, cpu); +#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; -- cgit v1.2.2 From ca84f69697da0f004135e45b63ca560b6bd3554e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: x86, mce: add MCE poll count to /proc/interrupts Keep a count of the machine check polls (or CMCI events) in /proc/interrupts. Andi needs this for debugging, but it's also useful in general to see what's going in by the kernel. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel/irq.c') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 05fc635c28c0..eff46b5de62f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -100,6 +100,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) -- cgit v1.2.2 From 8051dbd2dfd1427cc102888d7d96bf39de0be150 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 2 Jun 2009 16:53:23 +0900 Subject: x86, mce: fix for mce counters Make the MCE counters work on 32bit and add poll count in arch_irq_stat_cpu. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/irq.c') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index eff46b5de62f..9773395aa758 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -95,7 +95,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_NEW_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -172,9 +172,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - sum += per_cpu(mce_exception_count, cpu); -#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; @@ -191,6 +188,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) # ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; # endif +#endif +#ifdef CONFIG_X86_NEW_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); #endif return sum; } -- cgit v1.2.2