aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Sameer Nanda <snanda@chromium.org> 2012-07-30 17:40:00 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-30 20:25:13 -0400
commit: 45226e944ce071d0231949f2fea90969437cd2dc (patch)
tree: 6f323df1f61d028f77175525ae54948820cd09a1
parent: 190320c3b6640d4104650f55ff69611e050ea06b (diff)
NMI watchdog: fix for lockup detector breakage on resume
On the suspend/resume path the boot CPU does not go through an offline->online transition. This breaks the NMI detector post-resume since it depends on PMU state that is lost when the system gets suspended. Fix this by forcing a CPU offline->online transition for the lockup detector on the boot CPU during resume. To provide more context, we enable NMI watchdog on Chrome OS. We have seen several reports of systems freezing up completely which indicated that the NMI watchdog was not firing for some reason. Debugging further, we found a simple way of repro'ing system freezes -- issuing the command 'taskset 1 sh -c "echo nmilockup > /proc/breakme"' after the system has been suspended/resumed one or more times. With this patch in place, system freezes result in panics, as expected. These panics provide a nice stack trace for us to debug the actual issue causing the freeze. [akpm@linux-foundation.org: fiddle with code comment] [akpm@linux-foundation.org: make lockup_detector_bootcpu_resume() conditional on CONFIG_SUSPEND] [akpm@linux-foundation.org: fix section errors] Signed-off-by: Sameer Nanda <snanda@chromium.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Don Zickus <dzickus@redhat.com> Cc: Mandeep Singh Baines <msb@chromium.org> Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/sched.h | 8
-rw-r--r-- kernel/power/suspend.c | 3
-rw-r--r-- kernel/watchdog.c | 21
3 files changed, 30 insertions, 2 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e26a5e45aa6..68dcffaa62a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -334,6 +334,14 @@ static inline void lockup_detector_init(void)
334} 334}
335#endif 335#endif
336 336
337#if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND)
338void lockup_detector_bootcpu_resume(void);
339#else
340static inline void lockup_detector_bootcpu_resume(void)
341{
342}
343#endif
344
337#ifdef CONFIG_DETECT_HUNG_TASK 345#ifdef CONFIG_DETECT_HUNG_TASK
338extern unsigned int sysctl_hung_task_panic; 346extern unsigned int sysctl_hung_task_panic;
339extern unsigned long sysctl_hung_task_check_count; 347extern unsigned long sysctl_hung_task_check_count;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..1da39ea248fd 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -178,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
178 arch_suspend_enable_irqs(); 178 arch_suspend_enable_irqs();
179 BUG_ON(irqs_disabled()); 179 BUG_ON(irqs_disabled());
180 180
181 /* Kick the lockup detector */
182 lockup_detector_bootcpu_resume();
183
181 Enable_cpus: 184 Enable_cpus:
182 enable_nonboot_cpus(); 185 enable_nonboot_cpus();
183 186
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..69add8a9da68 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -575,7 +575,7 @@ out:
575/* 575/*
576 * Create/destroy watchdog threads as CPUs come and go: 576 * Create/destroy watchdog threads as CPUs come and go:
577 */ 577 */
578static int __cpuinit 578static int
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
580{ 580{
581 int hotcpu = (unsigned long)hcpu; 581 int hotcpu = (unsigned long)hcpu;
@@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
610 return NOTIFY_OK; 610 return NOTIFY_OK;
611} 611}
612 612
613static struct notifier_block __cpuinitdata cpu_nfb = { 613static struct notifier_block cpu_nfb = {
614 .notifier_call = cpu_callback 614 .notifier_call = cpu_callback
615}; 615};
616 616
617#ifdef CONFIG_SUSPEND
618/*
619 * On exit from suspend we force an offline->online transition on the boot CPU
620 * so that the PMU state that was lost while in suspended state gets set up
621 * properly for the boot CPU. This information is required for restarting the
622 * NMI watchdog.
623 */
624void lockup_detector_bootcpu_resume(void)
625{
626 void *cpu = (void *)(long)smp_processor_id();
627
628 cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
629 cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
630 cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
631}
632#endif
633
617void __init lockup_detector_init(void) 634void __init lockup_detector_init(void)
618{ 635{
619 void *cpu = (void *)(long)smp_processor_id(); 636 void *cpu = (void *)(long)smp_processor_id();