summaryrefslogtreecommitdiffstats
path: root/kernel/hung_task.c
diff options
context:
space:
mode:
author Dmitry Vyukov <dvyukov@google.com> 2018-08-22 00:55:52 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2018-08-22 13:52:47 -0400
commit a2e514453861dd39b53b7a50b6771bd3f9852078 (patch)
tree 5c5f930b2503ecd1fa5657b7fa95d897a799e6f9 /kernel/hung_task.c
parent 91bc9aaf746ae41016bd6b61a48133e162542574 (diff)
kernel/hung_task.c: allow to set checking interval separately from timeout
Currently the hung-task checking interval is equal to the timeout; as a result, a hung task is detected anywhere between timeout and 2*timeout. This is fine for most interactive environments, but it hurts automated testing setups (syzbot). In an automated setup we need to strictly order CPU lockup < RCU stall < workqueue lockup < task hung < silent loss, so that an RCU stall is not detected as a hung task and a hung task is not detected as silent machine loss. The large variance in the hung-task detection timeout requires setting the silent machine loss timeout to a very large value (e.g. if the hung-task timeout is 3 mins, then the silent loss timeout needs to be set to ~7 mins). The additional 3 minutes significantly reduce testing efficiency because usually we crash the kernel within a minute, and this can add hours to the bug localization process, as it needs to run dozens of tests. Allow setting the checking interval separately from the timeout. This makes it possible to set the timeout to, say, 3 minutes, but the checking interval to 10 secs. The interval is controlled via a new hung_task_check_interval_secs sysctl, similar to the existing hung_task_timeout_secs sysctl. The default value of 0 results in the current behavior: the checking interval is equal to the timeout. [akpm@linux-foundation.org: update hung_task_timeout_max's comment] Link: http://lkml.kernel.org/r/20180611111004.203513-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov <dvyukov@google.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/hung_task.c')
-rw-r--r-- kernel/hung_task.c 15
1 files changed, 14 insertions, 1 deletions
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 32b479468e4d..b9132d1269ef 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
40 */ 40 */
41unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; 41unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
42 42
43/*
44 * Zero (default value) means use sysctl_hung_task_timeout_secs:
45 */
46unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
47
43int __read_mostly sysctl_hung_task_warnings = 10; 48int __read_mostly sysctl_hung_task_warnings = 10;
44 49
45static int __read_mostly did_panic; 50static int __read_mostly did_panic;
@@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 103
99 if (switch_count != t->last_switch_count) { 104 if (switch_count != t->last_switch_count) {
100 t->last_switch_count = switch_count; 105 t->last_switch_count = switch_count;
106 t->last_switch_time = jiffies;
101 return; 107 return;
102 } 108 }
109 if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
110 return;
103 111
104 trace_sched_process_hang(t); 112 trace_sched_process_hang(t);
105 113
@@ -245,8 +253,13 @@ static int watchdog(void *dummy)
245 253
246 for ( ; ; ) { 254 for ( ; ; ) {
247 unsigned long timeout = sysctl_hung_task_timeout_secs; 255 unsigned long timeout = sysctl_hung_task_timeout_secs;
248 long t = hung_timeout_jiffies(hung_last_checked, timeout); 256 unsigned long interval = sysctl_hung_task_check_interval_secs;
257 long t;
249 258
259 if (interval == 0)
260 interval = timeout;
261 interval = min_t(unsigned long, interval, timeout);
262 t = hung_timeout_jiffies(hung_last_checked, interval);
250 if (t <= 0) { 263 if (t <= 0) {
251 if (!atomic_xchg(&reset_hung_task, 0)) 264 if (!atomic_xchg(&reset_hung_task, 0))
252 check_hung_uninterruptible_tasks(timeout); 265 check_hung_uninterruptible_tasks(timeout);