diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-08-02 05:28:21 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-08-02 07:27:17 -0400 |
commit | c1dc0b9c0c8979ce4d411caadff5c0d79dee58bc (patch) | |
tree | 6822205799a6cf8928623d60aa226c95534a20f9 /kernel/rcutree.c | |
parent | ed680c4ad478d0fee9740f7d029087f181346564 (diff) |
debug lockups: Improve lockup detection
When debugging a recent lockup bug i found various deficiencies
in how our current lockup detection helpers work:
- SysRq-L is not very efficient as it uses a workqueue, hence
it cannot punch through hard lockups and cannot see through
most soft lockups either.
- The SysRq-L code depends on the NMI watchdog - which is off
by default.
- We dont print backtraces from the RCU code's built-in
'RCU state machine is stuck' debug code. This debug
code tends to be one of the first (and only) mechanisms
that show that a lockup has occured.
This patch changes the code so taht we:
- Trigger the NMI backtrace code from SysRq-L instead of using
a workqueue (which cannot punch through hard lockups)
- Trigger print-all-CPU-backtraces from the RCU lockup detection
code
Also decouple the backtrace printing code from the NMI watchdog:
- Dont use variable size cpumasks (it might not be initialized
and they are a bit more fragile anyway)
- Trigger an NMI immediately via an IPI, instead of waiting
for the NMI tick to occur. This is a lot faster and can
produce more relevant backtraces. It will also work if the
NMI watchdog is disabled.
- Dont print the 'dazed and confused' message when we print
a backtrace from the NMI
- Do a show_regs() plus a dump_stack() to get maximum info
out of the dump. Worst-case we get two stacktraces - which
is not a big deal. Sometimes, if register content is
corrupted, the precise stack walker in show_regs() wont
give us a full backtrace - in this case dump_stack() will
do it.
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r-- | kernel/rcutree.c | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95c2027..9c5fa9fc57ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/rcupdate.h> | 35 | #include <linux/rcupdate.h> |
36 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
37 | #include <linux/sched.h> | 37 | #include <linux/sched.h> |
38 | #include <linux/nmi.h> | ||
38 | #include <asm/atomic.h> | 39 | #include <asm/atomic.h> |
39 | #include <linux/bitops.h> | 40 | #include <linux/bitops.h> |
40 | #include <linux/module.h> | 41 | #include <linux/module.h> |
@@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
469 | } | 470 | } |
470 | printk(" (detected by %d, t=%ld jiffies)\n", | 471 | printk(" (detected by %d, t=%ld jiffies)\n", |
471 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 472 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
473 | trigger_all_cpu_backtrace(); | ||
474 | |||
472 | force_quiescent_state(rsp, 0); /* Kick them all. */ | 475 | force_quiescent_state(rsp, 0); /* Kick them all. */ |
473 | } | 476 | } |
474 | 477 | ||
@@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
479 | 482 | ||
480 | printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", | 483 | printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", |
481 | smp_processor_id(), jiffies - rsp->gp_start); | 484 | smp_processor_id(), jiffies - rsp->gp_start); |
482 | dump_stack(); | 485 | trigger_all_cpu_backtrace(); |
486 | |||
483 | spin_lock_irqsave(&rnp->lock, flags); | 487 | spin_lock_irqsave(&rnp->lock, flags); |
484 | if ((long)(jiffies - rsp->jiffies_stall) >= 0) | 488 | if ((long)(jiffies - rsp->jiffies_stall) >= 0) |
485 | rsp->jiffies_stall = | 489 | rsp->jiffies_stall = |
486 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 490 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
487 | spin_unlock_irqrestore(&rnp->lock, flags); | 491 | spin_unlock_irqrestore(&rnp->lock, flags); |
492 | |||
488 | set_need_resched(); /* kick ourselves to get things going. */ | 493 | set_need_resched(); /* kick ourselves to get things going. */ |
489 | } | 494 | } |
490 | 495 | ||