diff options
author | Jack F Vogel <jfv@bluesong.net> | 2005-05-01 11:58:48 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-05-01 11:58:48 -0400 |
commit | 67701ae9767534534d3710664037dfde2cc04935 (patch) | |
tree | 6adb8d33585f8eee20794827c79e40991aeeaee5 | |
parent | fd51f666fa591294bd7462447512666e61c56ea0 (diff) |
[PATCH] check nmi watchdog is broken
A bug against an xSeries system showed up recently noting that the
check_nmi_watchdog() test was failing.
I have been investigating it and discovered that, in both i386 and x86_64,
the recent change to the routine to use the cpu_callin_map has uncovered a
problem. Prior to that change, on an SMP box, the test was trivially
passing because all CPUs were found to not yet be online. Now, with the
callin_map, they are discovered, so the test goes on to check the counters;
these have not yet begun to increment, so it announces a CPU is stuck and
bails out.
On all the systems I have access to test, the announcement of failure is
also bogus... by the time you can log in and check /proc/interrupts, the
NMI count is happily incrementing on all CPUs. It's just that the test is
being done too early.
I have tried moving the call to the test around a bit, and it was always
too early. I finally hit on this proposed solution: it delays the routine
via a late_initcall(), which seems like the right solution to me.
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | arch/i386/kernel/apic.c | 2 | ||||
-rw-r--r-- | arch/i386/kernel/io_apic.c | 2 | ||||
-rw-r--r-- | arch/i386/kernel/nmi.c | 11 | ||||
-rw-r--r-- | arch/i386/kernel/smpboot.c | 3 | ||||
-rw-r--r-- | arch/x86_64/kernel/io_apic.c | 2 | ||||
-rw-r--r-- | arch/x86_64/kernel/nmi.c | 9 | ||||
-rw-r--r-- | include/asm-i386/apic.h | 1 | ||||
-rw-r--r-- | include/asm-x86_64/apic.h | 1 |
8 files changed, 14 insertions, 17 deletions
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index e3879f7625c2..d509836b70c3 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c | |||
@@ -1265,8 +1265,6 @@ int __init APIC_init_uniprocessor (void) | |||
1265 | 1265 | ||
1266 | setup_local_APIC(); | 1266 | setup_local_APIC(); |
1267 | 1267 | ||
1268 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1269 | check_nmi_watchdog(); | ||
1270 | #ifdef CONFIG_X86_IO_APIC | 1268 | #ifdef CONFIG_X86_IO_APIC |
1271 | if (smp_found_config) | 1269 | if (smp_found_config) |
1272 | if (!skip_ioapic_setup && nr_ioapics) | 1270 | if (!skip_ioapic_setup && nr_ioapics) |
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 5e0d55be5435..7a324e8b86f9 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c | |||
@@ -2175,7 +2175,6 @@ static inline void check_timer(void) | |||
2175 | disable_8259A_irq(0); | 2175 | disable_8259A_irq(0); |
2176 | setup_nmi(); | 2176 | setup_nmi(); |
2177 | enable_8259A_irq(0); | 2177 | enable_8259A_irq(0); |
2178 | check_nmi_watchdog(); | ||
2179 | } | 2178 | } |
2180 | return; | 2179 | return; |
2181 | } | 2180 | } |
@@ -2198,7 +2197,6 @@ static inline void check_timer(void) | |||
2198 | add_pin_to_irq(0, 0, pin2); | 2197 | add_pin_to_irq(0, 0, pin2); |
2199 | if (nmi_watchdog == NMI_IO_APIC) { | 2198 | if (nmi_watchdog == NMI_IO_APIC) { |
2200 | setup_nmi(); | 2199 | setup_nmi(); |
2201 | check_nmi_watchdog(); | ||
2202 | } | 2200 | } |
2203 | return; | 2201 | return; |
2204 | } | 2202 | } |
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 2f89d000f954..2c0ee9c2d020 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c | |||
@@ -102,20 +102,21 @@ int nmi_active; | |||
102 | (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ | 102 | (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ |
103 | P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) | 103 | P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) |
104 | 104 | ||
105 | int __init check_nmi_watchdog (void) | 105 | static int __init check_nmi_watchdog(void) |
106 | { | 106 | { |
107 | unsigned int prev_nmi_count[NR_CPUS]; | 107 | unsigned int prev_nmi_count[NR_CPUS]; |
108 | int cpu; | 108 | int cpu; |
109 | 109 | ||
110 | printk(KERN_INFO "testing NMI watchdog ... "); | 110 | if (nmi_watchdog == NMI_NONE) |
111 | return 0; | ||
112 | |||
113 | printk(KERN_INFO "Testing NMI watchdog ... "); | ||
111 | 114 | ||
112 | for (cpu = 0; cpu < NR_CPUS; cpu++) | 115 | for (cpu = 0; cpu < NR_CPUS; cpu++) |
113 | prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; | 116 | prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; |
114 | local_irq_enable(); | 117 | local_irq_enable(); |
115 | mdelay((10*1000)/nmi_hz); // wait 10 ticks | 118 | mdelay((10*1000)/nmi_hz); // wait 10 ticks |
116 | 119 | ||
117 | /* FIXME: Only boot CPU is online at this stage. Check CPUs | ||
118 | as they come up. */ | ||
119 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 120 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
120 | #ifdef CONFIG_SMP | 121 | #ifdef CONFIG_SMP |
121 | /* Check cpu_callin_map here because that is set | 122 | /* Check cpu_callin_map here because that is set |
@@ -139,6 +140,8 @@ int __init check_nmi_watchdog (void) | |||
139 | 140 | ||
140 | return 0; | 141 | return 0; |
141 | } | 142 | } |
143 | /* This needs to happen later in boot so counters are working */ | ||
144 | late_initcall(check_nmi_watchdog); | ||
142 | 145 | ||
143 | static int __init setup_nmi_watchdog(char *str) | 146 | static int __init setup_nmi_watchdog(char *str) |
144 | { | 147 | { |
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index fd36d2f65f88..cbea7ac582e5 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c | |||
@@ -1089,9 +1089,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus) | |||
1089 | } | 1089 | } |
1090 | } | 1090 | } |
1091 | 1091 | ||
1092 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1093 | check_nmi_watchdog(); | ||
1094 | |||
1095 | smpboot_setup_io_apic(); | 1092 | smpboot_setup_io_apic(); |
1096 | 1093 | ||
1097 | setup_boot_APIC_clock(); | 1094 | setup_boot_APIC_clock(); |
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 29a257295484..60be58617eb9 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c | |||
@@ -1607,7 +1607,6 @@ static inline void check_timer(void) | |||
1607 | disable_8259A_irq(0); | 1607 | disable_8259A_irq(0); |
1608 | setup_nmi(); | 1608 | setup_nmi(); |
1609 | enable_8259A_irq(0); | 1609 | enable_8259A_irq(0); |
1610 | check_nmi_watchdog(); | ||
1611 | } | 1610 | } |
1612 | return; | 1611 | return; |
1613 | } | 1612 | } |
@@ -1627,7 +1626,6 @@ static inline void check_timer(void) | |||
1627 | nmi_watchdog_default(); | 1626 | nmi_watchdog_default(); |
1628 | if (nmi_watchdog == NMI_IO_APIC) { | 1627 | if (nmi_watchdog == NMI_IO_APIC) { |
1629 | setup_nmi(); | 1628 | setup_nmi(); |
1630 | check_nmi_watchdog(); | ||
1631 | } | 1629 | } |
1632 | return; | 1630 | return; |
1633 | } | 1631 | } |
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index e00d4adec36b..61de0b34a01e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c | |||
@@ -112,17 +112,20 @@ static __init int cpu_has_lapic(void) | |||
112 | } | 112 | } |
113 | } | 113 | } |
114 | 114 | ||
115 | int __init check_nmi_watchdog (void) | 115 | static int __init check_nmi_watchdog (void) |
116 | { | 116 | { |
117 | int counts[NR_CPUS]; | 117 | int counts[NR_CPUS]; |
118 | int cpu; | 118 | int cpu; |
119 | 119 | ||
120 | if (nmi_watchdog == NMI_NONE) | ||
121 | return 0; | ||
122 | |||
120 | if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { | 123 | if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { |
121 | nmi_watchdog = NMI_NONE; | 124 | nmi_watchdog = NMI_NONE; |
122 | return -1; | 125 | return -1; |
123 | } | 126 | } |
124 | 127 | ||
125 | printk(KERN_INFO "testing NMI watchdog ... "); | 128 | printk(KERN_INFO "Testing NMI watchdog ... "); |
126 | 129 | ||
127 | for (cpu = 0; cpu < NR_CPUS; cpu++) | 130 | for (cpu = 0; cpu < NR_CPUS; cpu++) |
128 | counts[cpu] = cpu_pda[cpu].__nmi_count; | 131 | counts[cpu] = cpu_pda[cpu].__nmi_count; |
@@ -148,6 +151,8 @@ int __init check_nmi_watchdog (void) | |||
148 | 151 | ||
149 | return 0; | 152 | return 0; |
150 | } | 153 | } |
154 | /* Have this called later during boot so counters are updating */ | ||
155 | late_initcall(check_nmi_watchdog); | ||
151 | 156 | ||
152 | int __init setup_nmi_watchdog(char *str) | 157 | int __init setup_nmi_watchdog(char *str) |
153 | { | 158 | { |
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index e1de67483f38..a5810cf7b578 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h | |||
@@ -109,7 +109,6 @@ extern int APIC_init_uniprocessor (void); | |||
109 | extern void disable_APIC_timer(void); | 109 | extern void disable_APIC_timer(void); |
110 | extern void enable_APIC_timer(void); | 110 | extern void enable_APIC_timer(void); |
111 | 111 | ||
112 | extern int check_nmi_watchdog (void); | ||
113 | extern void enable_NMI_through_LVT0 (void * dummy); | 112 | extern void enable_NMI_through_LVT0 (void * dummy); |
114 | 113 | ||
115 | extern unsigned int nmi_watchdog; | 114 | extern unsigned int nmi_watchdog; |
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index c025cc3ef789..e4b1017b8b2b 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h | |||
@@ -99,7 +99,6 @@ extern void disable_APIC_timer(void); | |||
99 | extern void enable_APIC_timer(void); | 99 | extern void enable_APIC_timer(void); |
100 | extern void clustered_apic_check(void); | 100 | extern void clustered_apic_check(void); |
101 | 101 | ||
102 | extern int check_nmi_watchdog(void); | ||
103 | extern void nmi_watchdog_default(void); | 102 | extern void nmi_watchdog_default(void); |
104 | extern int setup_nmi_watchdog(char *); | 103 | extern int setup_nmi_watchdog(char *); |
105 | 104 | ||