diff options
author | Mandeep Singh Baines <msb@chromium.org> | 2011-05-23 01:10:23 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-05-23 05:58:59 -0400 |
commit | 4eec42f392043063d0f019640b4ccc2a45570002 (patch) | |
tree | 32db1c354f9c12d1275093efed8101a2bd5db232 /kernel | |
parent | 586692a5a5fc5740c8a46abc0f2365495c2d7c5f (diff) |
watchdog: Change the default timeout and configure nmi watchdog period based on watchdog_thresh
Before the conversion of the NMI watchdog to perf events, the
watchdog timeout was 5 seconds. Now it is 60 seconds. For my
particular application, netbooks, 5 seconds was a better
timeout. With a short timeout, we catch faults earlier and are
able to send back a panic. With a 60 second timeout, the user is
unlikely to wait and will instead hit the power button, causing
us to lose the panic info.
This change derives the NMI watchdog period from watchdog_thresh
and sets the soft-lockup threshold to watchdog_thresh * 2. In
addition, the default watchdog_thresh was reduced from 60 to 10
seconds, as suggested by Ingo Molnar.
Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110517071642.GF22305@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/watchdog.c | 19 |
1 files changed, 15 insertions, 4 deletions
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 60301916f62e..6e63097fa73a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
29 | 29 | ||
30 | int watchdog_enabled = 1; | 30 | int watchdog_enabled = 1; |
31 | int __read_mostly watchdog_thresh = 60; | 31 | int __read_mostly watchdog_thresh = 10; |
32 | 32 | ||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) | |||
91 | __setup("nosoftlockup", nosoftlockup_setup); | 91 | __setup("nosoftlockup", nosoftlockup_setup); |
92 | /* */ | 92 | /* */ |
93 | 93 | ||
94 | /* | ||
95 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | ||
96 | * lockups can have false positives under extreme conditions. So we generally | ||
97 | * want a higher threshold for soft lockups than for hard lockups. So we couple | ||
98 | * the thresholds with a factor: we make the soft threshold twice the amount of | ||
99 | * time the hard threshold is. | ||
100 | */ | ||
101 | static int get_softlockup_thresh() | ||
102 | { | ||
103 | return watchdog_thresh * 2; | ||
104 | } | ||
94 | 105 | ||
95 | /* | 106 | /* |
96 | * Returns seconds, approximately. We don't need nanosecond | 107 | * Returns seconds, approximately. We don't need nanosecond |
@@ -110,7 +121,7 @@ static unsigned long get_sample_period(void) | |||
110 | * increment before the hardlockup detector generates | 121 | * increment before the hardlockup detector generates |
111 | * a warning | 122 | * a warning |
112 | */ | 123 | */ |
113 | return watchdog_thresh * (NSEC_PER_SEC / 5); | 124 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); |
114 | } | 125 | } |
115 | 126 | ||
116 | /* Commands for resetting the watchdog */ | 127 | /* Commands for resetting the watchdog */ |
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts) | |||
182 | unsigned long now = get_timestamp(smp_processor_id()); | 193 | unsigned long now = get_timestamp(smp_processor_id()); |
183 | 194 | ||
184 | /* Warn about unreasonable delays: */ | 195 | /* Warn about unreasonable delays: */ |
185 | if (time_after(now, touch_ts + watchdog_thresh)) | 196 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
186 | return now - touch_ts; | 197 | return now - touch_ts; |
187 | 198 | ||
188 | return 0; | 199 | return 0; |
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu) | |||
359 | 370 | ||
360 | /* Try to register using hardware perf events */ | 371 | /* Try to register using hardware perf events */ |
361 | wd_attr = &wd_hw_attr; | 372 | wd_attr = &wd_hw_attr; |
362 | wd_attr->sample_period = hw_nmi_get_sample_period(); | 373 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); |
363 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); | 374 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); |
364 | if (!IS_ERR(event)) { | 375 | if (!IS_ERR(event)) { |
365 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |