aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel/nmi.c
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2005-05-17 00:53:34 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-05-17 10:59:16 -0400
commit751521149a05e308d863d01ced61080ce1a2ec99 (patch)
treecb8312f502b257cc5d9a5a478acf35c5a5821665 /arch/x86_64/kernel/nmi.c
parentf3c5f5e7eeaf7c68ecc1d37200cd1ade0b3da7b9 (diff)
[PATCH] x86_64: Collected NMI watchdog fixes.
Collected NMI watchdog fixes. - Fix call of check_nmi_watchdog - Remove earlier move of check_nmi_watchdog to later. It does not fix the race it was supposed to fix fully. - Remove unused P6 definitions - Add support for performance counter based watchdog on P4 systems. This allows to run it only once per second, which saves some CPU time. Previously it would run at 1000Hz, which was too much. Code ported from i386 Make this the default on Intel systems. - Use check_nmi_watchdog with local APIC based nmi - Fix race in touch_nmi_watchdog - Fix bug that caused incorrect performance counters to be programmed in a few cases on K8. - Remove useless check for local APIC - Use local_t and per_cpu variables for per CPU data. - Keep other CPUs busy during check_nmi_watchdog to make sure they really tick when in lapic mode. - Only check CPUs that are actually online. - Various other fixes. - Fix fallback path when MSRs are unimplemented Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/kernel/nmi.c')
-rw-r--r--arch/x86_64/kernel/nmi.c250
1 files changed, 172 insertions, 78 deletions
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index ec13eb97e8e6..31c0f2e6ac91 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -33,6 +33,7 @@
33#include <asm/msr.h> 33#include <asm/msr.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/kdebug.h> 35#include <asm/kdebug.h>
36#include <asm/local.h>
36 37
37/* 38/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -59,7 +60,8 @@ int panic_on_timeout;
59 60
60unsigned int nmi_watchdog = NMI_DEFAULT; 61unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ; 62static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 63static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
64static unsigned int nmi_p4_cccr_val;
63 65
64/* Note that these events don't tick when the CPU idles. This means 66/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */ 67 the frequency varies with CPU load. */
@@ -71,67 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 73#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 74#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73 75
74#define P6_EVNTSEL0_ENABLE (1 << 22) 76#define MSR_P4_MISC_ENABLE 0x1A0
75#define P6_EVNTSEL_INT (1 << 20) 77#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
76#define P6_EVNTSEL_OS (1 << 17) 78#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
77#define P6_EVNTSEL_USR (1 << 16) 79#define MSR_P4_PERFCTR0 0x300
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 80#define MSR_P4_CCCR0 0x360
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED 81#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
82#define P4_ESCR_OS (1<<3)
83#define P4_ESCR_USR (1<<2)
84#define P4_CCCR_OVF_PMI0 (1<<26)
85#define P4_CCCR_OVF_PMI1 (1<<27)
86#define P4_CCCR_THRESHOLD(N) ((N)<<20)
87#define P4_CCCR_COMPLEMENT (1<<19)
88#define P4_CCCR_COMPARE (1<<18)
89#define P4_CCCR_REQUIRED (3<<16)
90#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
91#define P4_CCCR_ENABLE (1<<12)
92/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
93 CRU_ESCR0 (with any non-null event selector) through a complemented
94 max threshold. [IA32-Vol3, Section 14.9.9] */
95#define MSR_P4_IQ_COUNTER0 0x30C
96#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
97#define P4_NMI_IQ_CCCR0 \
98 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
99 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
100
101static __init inline int nmi_known_cpu(void)
102{
103 switch (boot_cpu_data.x86_vendor) {
104 case X86_VENDOR_AMD:
105 return boot_cpu_data.x86 == 15;
106 case X86_VENDOR_INTEL:
107 return boot_cpu_data.x86 == 15;
108 }
109 return 0;
110}
80 111
81/* Run after command line and cpu_init init, but before all other checks */ 112/* Run after command line and cpu_init init, but before all other checks */
82void __init nmi_watchdog_default(void) 113void __init nmi_watchdog_default(void)
83{ 114{
84 if (nmi_watchdog != NMI_DEFAULT) 115 if (nmi_watchdog != NMI_DEFAULT)
85 return; 116 return;
86 117 if (nmi_known_cpu())
87 /* For some reason the IO APIC watchdog doesn't work on the AMD 118 nmi_watchdog = NMI_LOCAL_APIC;
88 8111 chipset. For now switch to local APIC mode using 119 else
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC; 120 nmi_watchdog = NMI_IO_APIC;
99 }
100} 121}
101 122
102/* Why is there no CPUID flag for this? */ 123#ifdef CONFIG_SMP
103static __init int cpu_has_lapic(void) 124/* The performance counters used by NMI_LOCAL_APIC don't trigger when
125 * the CPU is idle. To make sure the NMI watchdog really ticks on all
126 * CPUs during the test make them busy.
127 */
128static __init void nmi_cpu_busy(void *data)
104{ 129{
105 switch (boot_cpu_data.x86_vendor) { 130 volatile int *endflag = data;
106 case X86_VENDOR_INTEL: 131 local_irq_enable();
107 case X86_VENDOR_AMD: 132 /* Intentionally don't use cpu_relax here. This is
108 return boot_cpu_data.x86 >= 6; 133 to make sure that the performance counter really ticks,
109 /* .... add more cpus here or find a different way to figure this out. */ 134 even if there is a simulator or similar that catches the
110 default: 135 pause instruction. On a real HT machine this is fine because
111 return 0; 136 all other CPUs are busy with "useless" delay loops and don't
112 } 137 care if they get somewhat less cycles. */
138 while (*endflag == 0)
139 barrier();
113} 140}
141#endif
114 142
115static int __init check_nmi_watchdog (void) 143int __init check_nmi_watchdog (void)
116{ 144{
145 volatile int endflag = 0;
117 int *counts; 146 int *counts;
118 int cpu; 147 int cpu;
119 148
120 if (nmi_watchdog == NMI_NONE) 149 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
121 return 0; 150 if (!counts)
151 return -1;
122 152
123 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { 153 printk(KERN_INFO "testing NMI watchdog ... ");
124 nmi_watchdog = NMI_NONE;
125 return -1;
126 }
127 154
128 counts = kmalloc(NR_CPUS * sizeof(int),GFP_KERNEL); 155 if (nmi_watchdog == NMI_LOCAL_APIC)
129 if (!counts) { 156 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
130 nmi_watchdog = NMI_NONE;
131 return 0;
132 }
133
134 printk(KERN_INFO "Testing NMI watchdog ... ");
135 157
136 for (cpu = 0; cpu < NR_CPUS; cpu++) 158 for (cpu = 0; cpu < NR_CPUS; cpu++)
137 counts[cpu] = cpu_pda[cpu].__nmi_count; 159 counts[cpu] = cpu_pda[cpu].__nmi_count;
@@ -139,16 +161,22 @@ static int __init check_nmi_watchdog (void)
139 mdelay((10*1000)/nmi_hz); // wait 10 ticks 161 mdelay((10*1000)/nmi_hz); // wait 10 ticks
140 162
141 for (cpu = 0; cpu < NR_CPUS; cpu++) { 163 for (cpu = 0; cpu < NR_CPUS; cpu++) {
164 if (!cpu_online(cpu))
165 continue;
142 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 166 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
143 printk("CPU#%d: NMI appears to be stuck (%d)!\n", 167 endflag = 1;
168 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
144 cpu, 169 cpu,
170 counts[cpu],
145 cpu_pda[cpu].__nmi_count); 171 cpu_pda[cpu].__nmi_count);
146 nmi_active = 0; 172 nmi_active = 0;
147 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 173 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
174 nmi_perfctr_msr = 0;
148 kfree(counts); 175 kfree(counts);
149 return -1; 176 return -1;
150 } 177 }
151 } 178 }
179 endflag = 1;
152 printk("OK.\n"); 180 printk("OK.\n");
153 181
154 /* now that we know it works we can reduce NMI frequency to 182 /* now that we know it works we can reduce NMI frequency to
@@ -159,8 +187,6 @@ static int __init check_nmi_watchdog (void)
159 kfree(counts); 187 kfree(counts);
160 return 0; 188 return 0;
161} 189}
162/* Have this called later during boot so counters are updating */
163late_initcall(check_nmi_watchdog);
164 190
165int __init setup_nmi_watchdog(char *str) 191int __init setup_nmi_watchdog(char *str)
166{ 192{
@@ -178,7 +204,7 @@ int __init setup_nmi_watchdog(char *str)
178 204
179 if (nmi >= NMI_INVALID) 205 if (nmi >= NMI_INVALID)
180 return 0; 206 return 0;
181 nmi_watchdog = nmi; 207 nmi_watchdog = nmi;
182 return 1; 208 return 1;
183} 209}
184 210
@@ -193,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void)
193 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 219 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
194 break; 220 break;
195 case X86_VENDOR_INTEL: 221 case X86_VENDOR_INTEL:
196 wrmsr(MSR_IA32_EVNTSEL0, 0, 0); 222 if (boot_cpu_data.x86 == 15) {
223 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
224 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
225 }
197 break; 226 break;
198 } 227 }
199 nmi_active = -1; 228 nmi_active = -1;
@@ -261,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
261 290
262static int nmi_pm_active; /* nmi_active before suspend */ 291static int nmi_pm_active; /* nmi_active before suspend */
263 292
264static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 293static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
265{ 294{
266 nmi_pm_active = nmi_active; 295 nmi_pm_active = nmi_active;
267 disable_lapic_nmi_watchdog(); 296 disable_lapic_nmi_watchdog();
@@ -308,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs);
308 * Original code written by Keith Owens. 337 * Original code written by Keith Owens.
309 */ 338 */
310 339
340static void clear_msr_range(unsigned int base, unsigned int n)
341{
342 unsigned int i;
343
344 for(i = 0; i < n; ++i)
345 wrmsr(base+i, 0, 0);
346}
347
311static void setup_k7_watchdog(void) 348static void setup_k7_watchdog(void)
312{ 349{
313 int i; 350 int i;
314 unsigned int evntsel; 351 unsigned int evntsel;
315 352
316 /* No check, so can start with slow frequency */
317 nmi_hz = 1;
318
319 /* XXX should check these in EFER */
320
321 nmi_perfctr_msr = MSR_K7_PERFCTR0; 353 nmi_perfctr_msr = MSR_K7_PERFCTR0;
322 354
323 for(i = 0; i < 4; ++i) { 355 for(i = 0; i < 4; ++i) {
324 /* Simulator may not support it */ 356 /* Simulator may not support it */
325 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) 357 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
358 nmi_perfctr_msr = 0;
326 return; 359 return;
360 }
327 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 361 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
328 } 362 }
329 363
@@ -333,12 +367,54 @@ static void setup_k7_watchdog(void)
333 | K7_NMI_EVENT; 367 | K7_NMI_EVENT;
334 368
335 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 369 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
336 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); 370 wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
337 apic_write(APIC_LVTPC, APIC_DM_NMI); 371 apic_write(APIC_LVTPC, APIC_DM_NMI);
338 evntsel |= K7_EVNTSEL_ENABLE; 372 evntsel |= K7_EVNTSEL_ENABLE;
339 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 373 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
340} 374}
341 375
376
377static int setup_p4_watchdog(void)
378{
379 unsigned int misc_enable, dummy;
380
381 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
382 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
383 return 0;
384
385 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
386 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
387#ifdef CONFIG_SMP
388 if (smp_num_siblings == 2)
389 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
390#endif
391
392 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
393 clear_msr_range(0x3F1, 2);
394 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
395 docs doesn't fully define it, so leave it alone for now. */
396 if (boot_cpu_data.x86_model >= 0x3) {
397 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
398 clear_msr_range(0x3A0, 26);
399 clear_msr_range(0x3BC, 3);
400 } else {
401 clear_msr_range(0x3A0, 31);
402 }
403 clear_msr_range(0x3C0, 6);
404 clear_msr_range(0x3C8, 6);
405 clear_msr_range(0x3E0, 2);
406 clear_msr_range(MSR_P4_CCCR0, 18);
407 clear_msr_range(MSR_P4_PERFCTR0, 18);
408
409 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
410 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
411 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
412 wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
413 apic_write(APIC_LVTPC, APIC_DM_NMI);
414 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
415 return 1;
416}
417
342void setup_apic_nmi_watchdog(void) 418void setup_apic_nmi_watchdog(void)
343{ 419{
344 switch (boot_cpu_data.x86_vendor) { 420 switch (boot_cpu_data.x86_vendor) {
@@ -349,6 +425,13 @@ void setup_apic_nmi_watchdog(void)
349 return; 425 return;
350 setup_k7_watchdog(); 426 setup_k7_watchdog();
351 break; 427 break;
428 case X86_VENDOR_INTEL:
429 if (boot_cpu_data.x86 != 15)
430 return;
431 if (!setup_p4_watchdog())
432 return;
433 break;
434
352 default: 435 default:
353 return; 436 return;
354 } 437 }
@@ -363,56 +446,67 @@ void setup_apic_nmi_watchdog(void)
363 * 446 *
364 * as these watchdog NMI IRQs are generated on every CPU, we only 447 * as these watchdog NMI IRQs are generated on every CPU, we only
365 * have to check the current processor. 448 * have to check the current processor.
366 *
367 * since NMIs don't listen to _any_ locks, we have to be extremely
368 * careful not to rely on unsafe variables. The printk might lock
369 * up though, so we have to break up any console locks first ...
370 * [when there will be more tty-related locks, break them up
371 * here too!]
372 */ 449 */
373 450
374static unsigned int 451static DEFINE_PER_CPU(unsigned, last_irq_sum);
375 last_irq_sums [NR_CPUS], 452static DEFINE_PER_CPU(local_t, alert_counter);
376 alert_counter [NR_CPUS]; 453static DEFINE_PER_CPU(int, nmi_touch);
377 454
378void touch_nmi_watchdog (void) 455void touch_nmi_watchdog (void)
379{ 456{
380 int i; 457 int i;
381 458
382 /* 459 /*
383 * Just reset the alert counters, (other CPUs might be 460 * Tell other CPUs to reset their alert counters. We cannot
384 * spinning on locks we hold): 461 * do it ourselves because the alert count increase is not
462 * atomic.
385 */ 463 */
386 for (i = 0; i < NR_CPUS; i++) 464 for (i = 0; i < NR_CPUS; i++)
387 alert_counter[i] = 0; 465 per_cpu(nmi_touch, i) = 1;
388} 466}
389 467
390void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 468void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
391{ 469{
392 int sum, cpu; 470 int sum;
471 int touched = 0;
393 472
394 cpu = safe_smp_processor_id();
395 sum = read_pda(apic_timer_irqs); 473 sum = read_pda(apic_timer_irqs);
396 if (last_irq_sums[cpu] == sum) { 474 if (__get_cpu_var(nmi_touch)) {
475 __get_cpu_var(nmi_touch) = 0;
476 touched = 1;
477 }
478 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
397 /* 479 /*
398 * Ayiee, looks like this CPU is stuck ... 480 * Ayiee, looks like this CPU is stuck ...
399 * wait a few IRQs (5 seconds) before doing the oops ... 481 * wait a few IRQs (5 seconds) before doing the oops ...
400 */ 482 */
401 alert_counter[cpu]++; 483 local_inc(&__get_cpu_var(alert_counter));
402 if (alert_counter[cpu] == 5*nmi_hz) { 484 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
403 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 485 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
404 == NOTIFY_STOP) { 486 == NOTIFY_STOP) {
405 alert_counter[cpu] = 0; 487 local_set(&__get_cpu_var(alert_counter), 0);
406 return; 488 return;
407 } 489 }
408 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); 490 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
409 } 491 }
410 } else { 492 } else {
411 last_irq_sums[cpu] = sum; 493 __get_cpu_var(last_irq_sum) = sum;
412 alert_counter[cpu] = 0; 494 local_set(&__get_cpu_var(alert_counter), 0);
413 } 495 }
414 if (nmi_perfctr_msr) 496 if (nmi_perfctr_msr) {
497 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
498 /*
499 * P4 quirks:
500 * - An overflown perfctr will assert its interrupt
501 * until the OVF flag in its CCCR is cleared.
502 * - LVTPC is masked on interrupt and must be
503 * unmasked by the LVTPC handler.
504 */
505 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
506 apic_write(APIC_LVTPC, APIC_DM_NMI);
507 }
415 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); 508 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
509 }
416} 510}
417 511
418static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 512static int dummy_nmi_callback(struct pt_regs * regs, int cpu)