diff options
author | chai wen <chaiw.fnst@cn.fujitsu.com> | 2014-10-09 18:25:17 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-09 22:25:48 -0400 |
commit | b1a8de1f534337b398c7778578a56ec4f018cb27 (patch) | |
tree | f703ad9492c640ee415e156506767778c88e3026 /kernel/watchdog.c | |
parent | f775da2fc2a8e42aa49eddbf5186ac3df8961a71 (diff) |
softlockup: make detector be aware of task switch of processes hogging cpu
Currently, the soft lockup detector warns only once for each case of
process softlockup. But the 'watchdog/n' thread may not always get the cpu
in the window between the task switch of two processes hogging that cpu,
so soft_watchdog_warn may never be reset.
An example would be two processes hogging the cpu. Process A causes the
softlockup warning and is killed manually by a user. Process B
immediately becomes the new process hogging the cpu, preventing the
softlockup code from resetting the soft_watchdog_warn variable.
This case is a false negative caused by the "warn only once for a process"
policy, as a different process may now be the one hogging the cpu. Resolve
this by saving the task pointer of the hogging process and checking it
against the current task, and using a mismatch to reset soft_watchdog_warn too.
[dzickus@redhat.com: update comment]
Signed-off-by: chai wen <chaiw.fnst@cn.fujitsu.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/watchdog.c')
-rw-r--r-- | kernel/watchdog.c | 18 |
1 files changed, 17 insertions, 1 deletions
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a8d6914030fe..7b223b212683 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); | |||
47 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 47 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
48 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | 48 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); |
49 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | 49 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); |
50 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | ||
50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 51 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
51 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 52 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
52 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 53 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
@@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
333 | return HRTIMER_RESTART; | 334 | return HRTIMER_RESTART; |
334 | 335 | ||
335 | /* only warn once */ | 336 | /* only warn once */ |
336 | if (__this_cpu_read(soft_watchdog_warn) == true) | 337 | if (__this_cpu_read(soft_watchdog_warn) == true) { |
338 | /* | ||
339 | * When multiple processes are causing softlockups the | ||
340 | * softlockup detector only warns on the first one | ||
341 | * because the code relies on a full quiet cycle to | ||
342 | * re-arm. The second process prevents the quiet cycle | ||
343 | * and never gets reported. Use task pointers to detect | ||
344 | * this. | ||
345 | */ | ||
346 | if (__this_cpu_read(softlockup_task_ptr_saved) != | ||
347 | current) { | ||
348 | __this_cpu_write(soft_watchdog_warn, false); | ||
349 | __touch_watchdog(); | ||
350 | } | ||
337 | return HRTIMER_RESTART; | 351 | return HRTIMER_RESTART; |
352 | } | ||
338 | 353 | ||
339 | if (softlockup_all_cpu_backtrace) { | 354 | if (softlockup_all_cpu_backtrace) { |
340 | /* Prevent multiple soft-lockup reports if one cpu is already | 355 | /* Prevent multiple soft-lockup reports if one cpu is already |
@@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
350 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 365 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
351 | smp_processor_id(), duration, | 366 | smp_processor_id(), duration, |
352 | current->comm, task_pid_nr(current)); | 367 | current->comm, task_pid_nr(current)); |
368 | __this_cpu_write(softlockup_task_ptr_saved, current); | ||
353 | print_modules(); | 369 | print_modules(); |
354 | print_irqtrace_events(current); | 370 | print_irqtrace_events(current); |
355 | if (regs) | 371 | if (regs) |