aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack F Vogel <jfv@bluesong.net>2005-05-01 11:58:48 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-05-01 11:58:48 -0400
commit67701ae9767534534d3710664037dfde2cc04935 (patch)
tree6adb8d33585f8eee20794827c79e40991aeeaee5
parentfd51f666fa591294bd7462447512666e61c56ea0 (diff)
[PATCH] check nmi watchdog is broken
A bug against an xSeries system showed up recently noting that the check_nmi_watchdog() test was failing. I have been investigating it and discovered that, in both i386 and x86_64, the recent change to the routine to use the cpu_callin_map has uncovered a problem. Prior to that change, on an SMP box, the test was trivially passing because all CPUs were found to not yet be online, but now that they are discovered via the callin_map, it goes on to test the counters, and they have not yet begun to increment, so it announces that a CPU is stuck and bails out. On all the systems I have access to test, the announcement of failure is also bogus... by the time you can log in and check /proc/interrupts, the NMI count is happily incrementing on all CPUs. It's just that the test is being done too early. I have tried moving the call to the test around a bit, and it was always too early. I finally hit on this proposed solution: it delays the routine via a late_initcall(), which seems like the right solution to me. Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/i386/kernel/apic.c2
-rw-r--r--arch/i386/kernel/io_apic.c2
-rw-r--r--arch/i386/kernel/nmi.c11
-rw-r--r--arch/i386/kernel/smpboot.c3
-rw-r--r--arch/x86_64/kernel/io_apic.c2
-rw-r--r--arch/x86_64/kernel/nmi.c9
-rw-r--r--include/asm-i386/apic.h1
-rw-r--r--include/asm-x86_64/apic.h1
8 files changed, 14 insertions, 17 deletions
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index e3879f7625c2..d509836b70c3 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -1265,8 +1265,6 @@ int __init APIC_init_uniprocessor (void)
1265 1265
1266 setup_local_APIC(); 1266 setup_local_APIC();
1267 1267
1268 if (nmi_watchdog == NMI_LOCAL_APIC)
1269 check_nmi_watchdog();
1270#ifdef CONFIG_X86_IO_APIC 1268#ifdef CONFIG_X86_IO_APIC
1271 if (smp_found_config) 1269 if (smp_found_config)
1272 if (!skip_ioapic_setup && nr_ioapics) 1270 if (!skip_ioapic_setup && nr_ioapics)
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 5e0d55be5435..7a324e8b86f9 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -2175,7 +2175,6 @@ static inline void check_timer(void)
2175 disable_8259A_irq(0); 2175 disable_8259A_irq(0);
2176 setup_nmi(); 2176 setup_nmi();
2177 enable_8259A_irq(0); 2177 enable_8259A_irq(0);
2178 check_nmi_watchdog();
2179 } 2178 }
2180 return; 2179 return;
2181 } 2180 }
@@ -2198,7 +2197,6 @@ static inline void check_timer(void)
2198 add_pin_to_irq(0, 0, pin2); 2197 add_pin_to_irq(0, 0, pin2);
2199 if (nmi_watchdog == NMI_IO_APIC) { 2198 if (nmi_watchdog == NMI_IO_APIC) {
2200 setup_nmi(); 2199 setup_nmi();
2201 check_nmi_watchdog();
2202 } 2200 }
2203 return; 2201 return;
2204 } 2202 }
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 2f89d000f954..2c0ee9c2d020 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -102,20 +102,21 @@ int nmi_active;
102 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ 102 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
103 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) 103 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
104 104
105int __init check_nmi_watchdog (void) 105static int __init check_nmi_watchdog(void)
106{ 106{
107 unsigned int prev_nmi_count[NR_CPUS]; 107 unsigned int prev_nmi_count[NR_CPUS];
108 int cpu; 108 int cpu;
109 109
110 printk(KERN_INFO "testing NMI watchdog ... "); 110 if (nmi_watchdog == NMI_NONE)
111 return 0;
112
113 printk(KERN_INFO "Testing NMI watchdog ... ");
111 114
112 for (cpu = 0; cpu < NR_CPUS; cpu++) 115 for (cpu = 0; cpu < NR_CPUS; cpu++)
113 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; 116 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
114 local_irq_enable(); 117 local_irq_enable();
115 mdelay((10*1000)/nmi_hz); // wait 10 ticks 118 mdelay((10*1000)/nmi_hz); // wait 10 ticks
116 119
117 /* FIXME: Only boot CPU is online at this stage. Check CPUs
118 as they come up. */
119 for (cpu = 0; cpu < NR_CPUS; cpu++) { 120 for (cpu = 0; cpu < NR_CPUS; cpu++) {
120#ifdef CONFIG_SMP 121#ifdef CONFIG_SMP
121 /* Check cpu_callin_map here because that is set 122 /* Check cpu_callin_map here because that is set
@@ -139,6 +140,8 @@ int __init check_nmi_watchdog (void)
139 140
140 return 0; 141 return 0;
141} 142}
143/* This needs to happen later in boot so counters are working */
144late_initcall(check_nmi_watchdog);
142 145
143static int __init setup_nmi_watchdog(char *str) 146static int __init setup_nmi_watchdog(char *str)
144{ 147{
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index fd36d2f65f88..cbea7ac582e5 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1089,9 +1089,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1089 } 1089 }
1090 } 1090 }
1091 1091
1092 if (nmi_watchdog == NMI_LOCAL_APIC)
1093 check_nmi_watchdog();
1094
1095 smpboot_setup_io_apic(); 1092 smpboot_setup_io_apic();
1096 1093
1097 setup_boot_APIC_clock(); 1094 setup_boot_APIC_clock();
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 29a257295484..60be58617eb9 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1607,7 +1607,6 @@ static inline void check_timer(void)
1607 disable_8259A_irq(0); 1607 disable_8259A_irq(0);
1608 setup_nmi(); 1608 setup_nmi();
1609 enable_8259A_irq(0); 1609 enable_8259A_irq(0);
1610 check_nmi_watchdog();
1611 } 1610 }
1612 return; 1611 return;
1613 } 1612 }
@@ -1627,7 +1626,6 @@ static inline void check_timer(void)
1627 nmi_watchdog_default(); 1626 nmi_watchdog_default();
1628 if (nmi_watchdog == NMI_IO_APIC) { 1627 if (nmi_watchdog == NMI_IO_APIC) {
1629 setup_nmi(); 1628 setup_nmi();
1630 check_nmi_watchdog();
1631 } 1629 }
1632 return; 1630 return;
1633 } 1631 }
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index e00d4adec36b..61de0b34a01e 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -112,17 +112,20 @@ static __init int cpu_has_lapic(void)
112 } 112 }
113} 113}
114 114
115int __init check_nmi_watchdog (void) 115static int __init check_nmi_watchdog (void)
116{ 116{
117 int counts[NR_CPUS]; 117 int counts[NR_CPUS];
118 int cpu; 118 int cpu;
119 119
120 if (nmi_watchdog == NMI_NONE)
121 return 0;
122
120 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { 123 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) {
121 nmi_watchdog = NMI_NONE; 124 nmi_watchdog = NMI_NONE;
122 return -1; 125 return -1;
123 } 126 }
124 127
125 printk(KERN_INFO "testing NMI watchdog ... "); 128 printk(KERN_INFO "Testing NMI watchdog ... ");
126 129
127 for (cpu = 0; cpu < NR_CPUS; cpu++) 130 for (cpu = 0; cpu < NR_CPUS; cpu++)
128 counts[cpu] = cpu_pda[cpu].__nmi_count; 131 counts[cpu] = cpu_pda[cpu].__nmi_count;
@@ -148,6 +151,8 @@ int __init check_nmi_watchdog (void)
148 151
149 return 0; 152 return 0;
150} 153}
154/* Have this called later during boot so counters are updating */
155late_initcall(check_nmi_watchdog);
151 156
152int __init setup_nmi_watchdog(char *str) 157int __init setup_nmi_watchdog(char *str)
153{ 158{
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index e1de67483f38..a5810cf7b578 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -109,7 +109,6 @@ extern int APIC_init_uniprocessor (void);
109extern void disable_APIC_timer(void); 109extern void disable_APIC_timer(void);
110extern void enable_APIC_timer(void); 110extern void enable_APIC_timer(void);
111 111
112extern int check_nmi_watchdog (void);
113extern void enable_NMI_through_LVT0 (void * dummy); 112extern void enable_NMI_through_LVT0 (void * dummy);
114 113
115extern unsigned int nmi_watchdog; 114extern unsigned int nmi_watchdog;
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index c025cc3ef789..e4b1017b8b2b 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -99,7 +99,6 @@ extern void disable_APIC_timer(void);
99extern void enable_APIC_timer(void); 99extern void enable_APIC_timer(void);
100extern void clustered_apic_check(void); 100extern void clustered_apic_check(void);
101 101
102extern int check_nmi_watchdog(void);
103extern void nmi_watchdog_default(void); 102extern void nmi_watchdog_default(void);
104extern int setup_nmi_watchdog(char *); 103extern int setup_nmi_watchdog(char *);
105 104