about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorMandeep Singh Baines <msb@chromium.org>2011-05-23 01:10:23 -0400
committerIngo Molnar <mingo@elte.hu>2011-05-23 05:58:59 -0400
commit4eec42f392043063d0f019640b4ccc2a45570002 (patch)
tree32db1c354f9c12d1275093efed8101a2bd5db232
parent586692a5a5fc5740c8a46abc0f2365495c2d7c5f (diff)
watchdog: Change the default timeout and configure nmi watchdog period based on watchdog_thresh
Before the conversion of the NMI watchdog to perf event, the watchdog timeout was 5 seconds. Now it is 60 seconds. For my particular application, netbooks, 5 seconds was a better timeout. With a short timeout, we catch faults earlier and are able to send back a panic. With a 60 second timeout, the user is unlikely to wait and will instead hit the power button, causing us to lose the panic info.

This change configures the NMI period to watchdog_thresh and sets the softlockup_thresh to watchdog_thresh * 2. In addition, watchdog_thresh was reduced to 10 seconds as suggested by Ingo Molnar.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110517071642.GF22305@elte.hu>
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c4
-rw-r--r--include/linux/nmi.h2
-rw-r--r--kernel/watchdog.c19
3 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb6..d5e57db0f7be 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20 20
21#ifdef CONFIG_HARDLOCKUP_DETECTOR 21#ifdef CONFIG_HARDLOCKUP_DETECTOR
22u64 hw_nmi_get_sample_period(void) 22u64 hw_nmi_get_sample_period(int watchdog_thresh)
23{ 23{
24 return (u64)(cpu_khz) * 1000 * 60; 24 return (u64)(cpu_khz) * 1000 * watchdog_thresh;
25} 25}
26#endif 26#endif
27 27
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5317b8b2198f..2d304efc89df 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,7 +45,7 @@ static inline bool trigger_all_cpu_backtrace(void)
45 45
46#ifdef CONFIG_LOCKUP_DETECTOR 46#ifdef CONFIG_LOCKUP_DETECTOR
47int hw_nmi_is_cpu_stuck(struct pt_regs *); 47int hw_nmi_is_cpu_stuck(struct pt_regs *);
48u64 hw_nmi_get_sample_period(void); 48u64 hw_nmi_get_sample_period(int watchdog_thresh);
49extern int watchdog_enabled; 49extern int watchdog_enabled;
50extern int watchdog_thresh; 50extern int watchdog_thresh;
51struct ctl_table; 51struct ctl_table;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 60301916f62e..6e63097fa73a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled = 1; 30int watchdog_enabled = 1;
31int __read_mostly watchdog_thresh = 60; 31int __read_mostly watchdog_thresh = 10;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
91__setup("nosoftlockup", nosoftlockup_setup); 91__setup("nosoftlockup", nosoftlockup_setup);
92/* */ 92/* */
93 93
94/*
95 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
96 * lockups can have false positives under extreme conditions. So we generally
97 * want a higher threshold for soft lockups than for hard lockups. So we couple
98 * the thresholds with a factor: we make the soft threshold twice the amount of
99 * time the hard threshold is.
100 */
101static int get_softlockup_thresh()
102{
103 return watchdog_thresh * 2;
104}
94 105
95/* 106/*
96 * Returns seconds, approximately. We don't need nanosecond 107 * Returns seconds, approximately. We don't need nanosecond
@@ -110,7 +121,7 @@ static unsigned long get_sample_period(void)
110 * increment before the hardlockup detector generates 121 * increment before the hardlockup detector generates
111 * a warning 122 * a warning
112 */ 123 */
113 return watchdog_thresh * (NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
114} 125}
115 126
116/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
182 unsigned long now = get_timestamp(smp_processor_id()); 193 unsigned long now = get_timestamp(smp_processor_id());
183 194
184 /* Warn about unreasonable delays: */ 195 /* Warn about unreasonable delays: */
185 if (time_after(now, touch_ts + watchdog_thresh)) 196 if (time_after(now, touch_ts + get_softlockup_thresh()))
186 return now - touch_ts; 197 return now - touch_ts;
187 198
188 return 0; 199 return 0;
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)
359 370
360 /* Try to register using hardware perf events */ 371 /* Try to register using hardware perf events */
361 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
362 wd_attr->sample_period = hw_nmi_get_sample_period(); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
363 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
364 if (!IS_ERR(event)) { 375 if (!IS_ERR(event)) {
365 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");