aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel/nmi.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/kernel/nmi.c')
-rw-r--r--arch/x86_64/kernel/nmi.c250
1 files changed, 172 insertions, 78 deletions
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index ec13eb97e8e6..31c0f2e6ac91 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -33,6 +33,7 @@
33#include <asm/msr.h> 33#include <asm/msr.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/kdebug.h> 35#include <asm/kdebug.h>
36#include <asm/local.h>
36 37
37/* 38/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -59,7 +60,8 @@ int panic_on_timeout;
59 60
60unsigned int nmi_watchdog = NMI_DEFAULT; 61unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ; 62static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 63static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
64static unsigned int nmi_p4_cccr_val;
63 65
64/* Note that these events don't tick when the CPU idles. This means 66/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */ 67 the frequency varies with CPU load. */
@@ -71,67 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 73#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 74#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73 75
74#define P6_EVNTSEL0_ENABLE (1 << 22) 76#define MSR_P4_MISC_ENABLE 0x1A0
75#define P6_EVNTSEL_INT (1 << 20) 77#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
76#define P6_EVNTSEL_OS (1 << 17) 78#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
77#define P6_EVNTSEL_USR (1 << 16) 79#define MSR_P4_PERFCTR0 0x300
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 80#define MSR_P4_CCCR0 0x360
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED 81#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
82#define P4_ESCR_OS (1<<3)
83#define P4_ESCR_USR (1<<2)
84#define P4_CCCR_OVF_PMI0 (1<<26)
85#define P4_CCCR_OVF_PMI1 (1<<27)
86#define P4_CCCR_THRESHOLD(N) ((N)<<20)
87#define P4_CCCR_COMPLEMENT (1<<19)
88#define P4_CCCR_COMPARE (1<<18)
89#define P4_CCCR_REQUIRED (3<<16)
90#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
91#define P4_CCCR_ENABLE (1<<12)
92/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
93 CRU_ESCR0 (with any non-null event selector) through a complemented
94 max threshold. [IA32-Vol3, Section 14.9.9] */
95#define MSR_P4_IQ_COUNTER0 0x30C
96#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
97#define P4_NMI_IQ_CCCR0 \
98 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
99 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
100
101static __init inline int nmi_known_cpu(void)
102{
103 switch (boot_cpu_data.x86_vendor) {
104 case X86_VENDOR_AMD:
105 return boot_cpu_data.x86 == 15;
106 case X86_VENDOR_INTEL:
107 return boot_cpu_data.x86 == 15;
108 }
109 return 0;
110}
80 111
81/* Run after command line and cpu_init init, but before all other checks */ 112/* Run after command line and cpu_init init, but before all other checks */
82void __init nmi_watchdog_default(void) 113void __init nmi_watchdog_default(void)
83{ 114{
84 if (nmi_watchdog != NMI_DEFAULT) 115 if (nmi_watchdog != NMI_DEFAULT)
85 return; 116 return;
86 117 if (nmi_known_cpu())
87 /* For some reason the IO APIC watchdog doesn't work on the AMD 118 nmi_watchdog = NMI_LOCAL_APIC;
88 8111 chipset. For now switch to local APIC mode using 119 else
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC; 120 nmi_watchdog = NMI_IO_APIC;
99 }
100} 121}
101 122
102/* Why is there no CPUID flag for this? */ 123#ifdef CONFIG_SMP
103static __init int cpu_has_lapic(void) 124/* The performance counters used by NMI_LOCAL_APIC don't trigger when
125 * the CPU is idle. To make sure the NMI watchdog really ticks on all
126 * CPUs during the test make them busy.
127 */
128static __init void nmi_cpu_busy(void *data)
104{ 129{
105 switch (boot_cpu_data.x86_vendor) { 130 volatile int *endflag = data;
106 case X86_VENDOR_INTEL: 131 local_irq_enable();
107 case X86_VENDOR_AMD: 132 /* Intentionally don't use cpu_relax here. This is
108 return boot_cpu_data.x86 >= 6; 133 to make sure that the performance counter really ticks,
109 /* .... add more cpus here or find a different way to figure this out. */ 134 even if there is a simulator or similar that catches the
110 default: 135 pause instruction. On a real HT machine this is fine because
111 return 0; 136 all other CPUs are busy with "useless" delay loops and don't
112 } 137 care if they get somewhat less cycles. */
138 while (*endflag == 0)
139 barrier();
113} 140}
141#endif
114 142
115static int __init check_nmi_watchdog (void) 143int __init check_nmi_watchdog (void)
116{ 144{
145 volatile int endflag = 0;
117 int *counts; 146 int *counts;
118 int cpu; 147 int cpu;
119 148
120 if (nmi_watchdog == NMI_NONE) 149 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
121 return 0; 150 if (!counts)
151 return -1;
122 152
123 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { 153 printk(KERN_INFO "testing NMI watchdog ... ");
124 nmi_watchdog = NMI_NONE;
125 return -1;
126 }
127 154
128 counts = kmalloc(NR_CPUS * sizeof(int),GFP_KERNEL); 155 if (nmi_watchdog == NMI_LOCAL_APIC)
129 if (!counts) { 156 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
130 nmi_watchdog = NMI_NONE;
131 return 0;
132 }
133
134 printk(KERN_INFO "Testing NMI watchdog ... ");
135 157
136 for (cpu = 0; cpu < NR_CPUS; cpu++) 158 for (cpu = 0; cpu < NR_CPUS; cpu++)
137 counts[cpu] = cpu_pda[cpu].__nmi_count; 159 counts[cpu] = cpu_pda[cpu].__nmi_count;
@@ -139,16 +161,22 @@ static int __init check_nmi_watchdog (void)
139 mdelay((10*1000)/nmi_hz); // wait 10 ticks 161 mdelay((10*1000)/nmi_hz); // wait 10 ticks
140 162
141 for (cpu = 0; cpu < NR_CPUS; cpu++) { 163 for (cpu = 0; cpu < NR_CPUS; cpu++) {
164 if (!cpu_online(cpu))
165 continue;
142 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 166 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
143 printk("CPU#%d: NMI appears to be stuck (%d)!\n", 167 endflag = 1;
168 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
144 cpu, 169 cpu,
170 counts[cpu],
145 cpu_pda[cpu].__nmi_count); 171 cpu_pda[cpu].__nmi_count);
146 nmi_active = 0; 172 nmi_active = 0;
147 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 173 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
174 nmi_perfctr_msr = 0;
148 kfree(counts); 175 kfree(counts);
149 return -1; 176 return -1;
150 } 177 }
151 } 178 }
179 endflag = 1;
152 printk("OK.\n"); 180 printk("OK.\n");
153 181
154 /* now that we know it works we can reduce NMI frequency to 182 /* now that we know it works we can reduce NMI frequency to
@@ -159,8 +187,6 @@ static int __init check_nmi_watchdog (void)
159 kfree(counts); 187 kfree(counts);
160 return 0; 188 return 0;
161} 189}
162/* Have this called later during boot so counters are updating */
163late_initcall(check_nmi_watchdog);
164 190
165int __init setup_nmi_watchdog(char *str) 191int __init setup_nmi_watchdog(char *str)
166{ 192{
@@ -178,7 +204,7 @@ int __init setup_nmi_watchdog(char *str)
178 204
179 if (nmi >= NMI_INVALID) 205 if (nmi >= NMI_INVALID)
180 return 0; 206 return 0;
181 nmi_watchdog = nmi; 207 nmi_watchdog = nmi;
182 return 1; 208 return 1;
183} 209}
184 210
@@ -193,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void)
193 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 219 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
194 break; 220 break;
195 case X86_VENDOR_INTEL: 221 case X86_VENDOR_INTEL:
196 wrmsr(MSR_IA32_EVNTSEL0, 0, 0); 222 if (boot_cpu_data.x86 == 15) {
223 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
224 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
225 }
197 break; 226 break;
198 } 227 }
199 nmi_active = -1; 228 nmi_active = -1;
@@ -261,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
261 290
262static int nmi_pm_active; /* nmi_active before suspend */ 291static int nmi_pm_active; /* nmi_active before suspend */
263 292
264static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 293static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
265{ 294{
266 nmi_pm_active = nmi_active; 295 nmi_pm_active = nmi_active;
267 disable_lapic_nmi_watchdog(); 296 disable_lapic_nmi_watchdog();
@@ -308,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs);
308 * Original code written by Keith Owens. 337 * Original code written by Keith Owens.
309 */ 338 */
310 339
340static void clear_msr_range(unsigned int base, unsigned int n)
341{
342 unsigned int i;
343
344 for(i = 0; i < n; ++i)
345 wrmsr(base+i, 0, 0);
346}
347
311static void setup_k7_watchdog(void) 348static void setup_k7_watchdog(void)
312{ 349{
313 int i; 350 int i;
314 unsigned int evntsel; 351 unsigned int evntsel;
315 352
316 /* No check, so can start with slow frequency */
317 nmi_hz = 1;
318
319 /* XXX should check these in EFER */
320
321 nmi_perfctr_msr = MSR_K7_PERFCTR0; 353 nmi_perfctr_msr = MSR_K7_PERFCTR0;
322 354
323 for(i = 0; i < 4; ++i) { 355 for(i = 0; i < 4; ++i) {
324 /* Simulator may not support it */ 356 /* Simulator may not support it */
325 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) 357 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
358 nmi_perfctr_msr = 0;
326 return; 359 return;
360 }
327 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 361 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
328 } 362 }
329 363
@@ -333,12 +367,54 @@ static void setup_k7_watchdog(void)
333 | K7_NMI_EVENT; 367 | K7_NMI_EVENT;
334 368
335 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 369 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
336 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); 370 wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
337 apic_write(APIC_LVTPC, APIC_DM_NMI); 371 apic_write(APIC_LVTPC, APIC_DM_NMI);
338 evntsel |= K7_EVNTSEL_ENABLE; 372 evntsel |= K7_EVNTSEL_ENABLE;
339 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 373 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
340} 374}
341 375
376
377static int setup_p4_watchdog(void)
378{
379 unsigned int misc_enable, dummy;
380
381 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
382 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
383 return 0;
384
385 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
386 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
387#ifdef CONFIG_SMP
388 if (smp_num_siblings == 2)
389 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
390#endif
391
392 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
393 clear_msr_range(0x3F1, 2);
394 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
395 docs doesn't fully define it, so leave it alone for now. */
396 if (boot_cpu_data.x86_model >= 0x3) {
397 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
398 clear_msr_range(0x3A0, 26);
399 clear_msr_range(0x3BC, 3);
400 } else {
401 clear_msr_range(0x3A0, 31);
402 }
403 clear_msr_range(0x3C0, 6);
404 clear_msr_range(0x3C8, 6);
405 clear_msr_range(0x3E0, 2);
406 clear_msr_range(MSR_P4_CCCR0, 18);
407 clear_msr_range(MSR_P4_PERFCTR0, 18);
408
409 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
410 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
411 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
412 wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
413 apic_write(APIC_LVTPC, APIC_DM_NMI);
414 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
415 return 1;
416}
417
342void setup_apic_nmi_watchdog(void) 418void setup_apic_nmi_watchdog(void)
343{ 419{
344 switch (boot_cpu_data.x86_vendor) { 420 switch (boot_cpu_data.x86_vendor) {
@@ -349,6 +425,13 @@ void setup_apic_nmi_watchdog(void)
349 return; 425 return;
350 setup_k7_watchdog(); 426 setup_k7_watchdog();
351 break; 427 break;
428 case X86_VENDOR_INTEL:
429 if (boot_cpu_data.x86 != 15)
430 return;
431 if (!setup_p4_watchdog())
432 return;
433 break;
434
352 default: 435 default:
353 return; 436 return;
354 } 437 }
@@ -363,56 +446,67 @@ void setup_apic_nmi_watchdog(void)
363 * 446 *
364 * as these watchdog NMI IRQs are generated on every CPU, we only 447 * as these watchdog NMI IRQs are generated on every CPU, we only
365 * have to check the current processor. 448 * have to check the current processor.
366 *
367 * since NMIs don't listen to _any_ locks, we have to be extremely
368 * careful not to rely on unsafe variables. The printk might lock
369 * up though, so we have to break up any console locks first ...
370 * [when there will be more tty-related locks, break them up
371 * here too!]
372 */ 449 */
373 450
374static unsigned int 451static DEFINE_PER_CPU(unsigned, last_irq_sum);
375 last_irq_sums [NR_CPUS], 452static DEFINE_PER_CPU(local_t, alert_counter);
376 alert_counter [NR_CPUS]; 453static DEFINE_PER_CPU(int, nmi_touch);
377 454
378void touch_nmi_watchdog (void) 455void touch_nmi_watchdog (void)
379{ 456{
380 int i; 457 int i;
381 458
382 /* 459 /*
383 * Just reset the alert counters, (other CPUs might be 460 * Tell other CPUs to reset their alert counters. We cannot
384 * spinning on locks we hold): 461 * do it ourselves because the alert count increase is not
462 * atomic.
385 */ 463 */
386 for (i = 0; i < NR_CPUS; i++) 464 for (i = 0; i < NR_CPUS; i++)
387 alert_counter[i] = 0; 465 per_cpu(nmi_touch, i) = 1;
388} 466}
389 467
390void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 468void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
391{ 469{
392 int sum, cpu; 470 int sum;
471 int touched = 0;
393 472
394 cpu = safe_smp_processor_id();
395 sum = read_pda(apic_timer_irqs); 473 sum = read_pda(apic_timer_irqs);
396 if (last_irq_sums[cpu] == sum) { 474 if (__get_cpu_var(nmi_touch)) {
475 __get_cpu_var(nmi_touch) = 0;
476 touched = 1;
477 }
478 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
397 /* 479 /*
398 * Ayiee, looks like this CPU is stuck ... 480 * Ayiee, looks like this CPU is stuck ...
399 * wait a few IRQs (5 seconds) before doing the oops ... 481 * wait a few IRQs (5 seconds) before doing the oops ...
400 */ 482 */
401 alert_counter[cpu]++; 483 local_inc(&__get_cpu_var(alert_counter));
402 if (alert_counter[cpu] == 5*nmi_hz) { 484 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
403 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 485 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
404 == NOTIFY_STOP) { 486 == NOTIFY_STOP) {
405 alert_counter[cpu] = 0; 487 local_set(&__get_cpu_var(alert_counter), 0);
406 return; 488 return;
407 } 489 }
408 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); 490 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
409 } 491 }
410 } else { 492 } else {
411 last_irq_sums[cpu] = sum; 493 __get_cpu_var(last_irq_sum) = sum;
412 alert_counter[cpu] = 0; 494 local_set(&__get_cpu_var(alert_counter), 0);
413 } 495 }
414 if (nmi_perfctr_msr) 496 if (nmi_perfctr_msr) {
497 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
498 /*
499 * P4 quirks:
500 * - An overflown perfctr will assert its interrupt
501 * until the OVF flag in its CCCR is cleared.
502 * - LVTPC is masked on interrupt and must be
503 * unmasked by the LVTPC handler.
504 */
505 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
506 apic_write(APIC_LVTPC, APIC_DM_NMI);
507 }
415 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); 508 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
509 }
416} 510}
417 511
418static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 512static int dummy_nmi_callback(struct pt_regs * regs, int cpu)