aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/watchdog.c
diff options
context:
space:
mode:
author: chai wen <chaiw.fnst@cn.fujitsu.com> 2014-10-09 18:25:17 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-10-09 22:25:48 -0400
commit: b1a8de1f534337b398c7778578a56ec4f018cb27 (patch)
tree: f703ad9492c640ee415e156506767778c88e3026 /kernel/watchdog.c
parent: f775da2fc2a8e42aa49eddbf5186ac3df8961a71 (diff)
softlockup: make detector be aware of task switch of processes hogging cpu
For now, the soft lockup detector warns once for each case of process softlockup. But the thread 'watchdog/n' may not always get the cpu in the time slot between the task switch of two processes hogging that cpu, so it cannot reset soft_watchdog_warn.

An example would be two processes hogging the cpu. Process A causes the softlockup warning and is killed manually by a user. Process B immediately becomes the new process hogging the cpu, preventing the softlockup code from resetting the soft_watchdog_warn variable.

This case is a false negative of "warn only once for a process", as there may be a different process that is going to hog the cpu. Resolve this by saving/checking the task pointer of the hogging process and using that to reset soft_watchdog_warn too.

[dzickus@redhat.com: update comment]
Signed-off-by: chai wen <chaiw.fnst@cn.fujitsu.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/watchdog.c')
-rw-r--r-- kernel/watchdog.c | 18
1 files changed, 17 insertions, 1 deletions
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914030fe..7b223b212683 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
47static DEFINE_PER_CPU(bool, soft_watchdog_warn); 47static DEFINE_PER_CPU(bool, soft_watchdog_warn);
48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
50static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 51#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static DEFINE_PER_CPU(bool, hard_watchdog_warn); 52static DEFINE_PER_CPU(bool, hard_watchdog_warn);
52static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 53static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 return HRTIMER_RESTART; 334 return HRTIMER_RESTART;
334 335
335 /* only warn once */ 336 /* only warn once */
336 if (__this_cpu_read(soft_watchdog_warn) == true) 337 if (__this_cpu_read(soft_watchdog_warn) == true) {
338 /*
339 * When multiple processes are causing softlockups the
340 * softlockup detector only warns on the first one
341 * because the code relies on a full quiet cycle to
342 * re-arm. The second process prevents the quiet cycle
343 * and never gets reported. Use task pointers to detect
344 * this.
345 */
346 if (__this_cpu_read(softlockup_task_ptr_saved) !=
347 current) {
348 __this_cpu_write(soft_watchdog_warn, false);
349 __touch_watchdog();
350 }
337 return HRTIMER_RESTART; 351 return HRTIMER_RESTART;
352 }
338 353
339 if (softlockup_all_cpu_backtrace) { 354 if (softlockup_all_cpu_backtrace) {
340 /* Prevent multiple soft-lockup reports if one cpu is already 355 /* Prevent multiple soft-lockup reports if one cpu is already
@@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
350 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 365 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
351 smp_processor_id(), duration, 366 smp_processor_id(), duration,
352 current->comm, task_pid_nr(current)); 367 current->comm, task_pid_nr(current));
368 __this_cpu_write(softlockup_task_ptr_saved, current);
353 print_modules(); 369 print_modules();
354 print_irqtrace_events(current); 370 print_irqtrace_events(current);
355 if (regs) 371 if (regs)