aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2005-09-06 18:16:27 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-07 19:57:17 -0400
commit8446f1d391f3d27e6bf9c43d4cbcdac0ca720417 (patch)
tree738853af877c9a391b4f2db467e7f90c6e2e38ed
parent4732efbeb997189d9f9b04708dc26bf8613ed721 (diff)
[PATCH] detect soft lockups
This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP. When enabled then per-CPU watchdog threads are started, which try to run once per second. If they get delayed for more than 10 seconds then a callback from the timer interrupt detects this condition and prints out a warning message and a stack dump (once per lockup incident). The feature is otherwise non-intrusive, it doesn't try to unlock the box in any way, it only gets the debug info out, automatically, and on all CPUs affected by the lockup. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de> Signed-off-by: Richard Purdie <rpurdie@rpsys.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/i386/kernel/nmi.c5
-rw-r--r--arch/i386/kernel/time.c1
-rw-r--r--arch/x86_64/kernel/nmi.c2
-rw-r--r--arch/x86_64/kernel/time.c1
-rw-r--r--drivers/mtd/nand/nand_base.c1
-rw-r--r--include/linux/sched.h17
-rw-r--r--init/main.c1
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/power/swsusp.c1
-rw-r--r--kernel/softlockup.c151
-rw-r--r--kernel/timer.c1
-rw-r--r--lib/Kconfig.debug19
12 files changed, 201 insertions, 0 deletions
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 8bbdbda07a2d..0178457db721 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
478 */ 478 */
479 for (i = 0; i < NR_CPUS; i++) 479 for (i = 0; i < NR_CPUS; i++)
480 alert_counter[i] = 0; 480 alert_counter[i] = 0;
481
482 /*
483 * Tickle the softlockup detector too:
484 */
485 touch_softlockup_watchdog();
481} 486}
482 487
483extern void die_nmi(struct pt_regs *, const char *msg); 488extern void die_nmi(struct pt_regs *, const char *msg);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 6f794a78ee1e..b0c5ee2b3446 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
422 last_timer->resume(); 422 last_timer->resume();
423 cur_timer = last_timer; 423 cur_timer = last_timer;
424 last_timer = NULL; 424 last_timer = NULL;
425 touch_softlockup_watchdog();
425 return 0; 426 return 0;
426} 427}
427 428
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 64a8e05d5811..84cae81fff8b 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
463 */ 463 */
464 for (i = 0; i < NR_CPUS; i++) 464 for (i = 0; i < NR_CPUS; i++)
465 per_cpu(nmi_touch, i) = 1; 465 per_cpu(nmi_touch, i) = 1;
466
467 touch_softlockup_watchdog();
466} 468}
467 469
468void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 470void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 66bf6ddeb0c3..2b5d9da912a2 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
1041 write_sequnlock_irqrestore(&xtime_lock,flags); 1041 write_sequnlock_irqrestore(&xtime_lock,flags);
1042 jiffies += sleep_length; 1042 jiffies += sleep_length;
1043 wall_jiffies += sleep_length; 1043 wall_jiffies += sleep_length;
1044 touch_softlockup_watchdog();
1044 return 0; 1045 return 0;
1045} 1046}
1046 1047
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index eee5115658c8..04e54318bc6a 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
526 do { 526 do {
527 if (this->dev_ready(mtd)) 527 if (this->dev_ready(mtd))
528 return; 528 return;
529 touch_softlockup_watchdog();
529 } while (time_before(jiffies, timeo)); 530 } while (time_before(jiffies, timeo));
530} 531}
531 532
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dec5827c7742..5fb31bede103 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,6 +176,23 @@ extern void trap_init(void);
176extern void update_process_times(int user); 176extern void update_process_times(int user);
177extern void scheduler_tick(void); 177extern void scheduler_tick(void);
178 178
179#ifdef CONFIG_DETECT_SOFTLOCKUP
180extern void softlockup_tick(struct pt_regs *regs);
181extern void spawn_softlockup_task(void);
182extern void touch_softlockup_watchdog(void);
183#else
184static inline void softlockup_tick(struct pt_regs *regs)
185{
186}
187static inline void spawn_softlockup_task(void)
188{
189}
190static inline void touch_softlockup_watchdog(void)
191{
192}
193#endif
194
195
179/* Attach to any functions which should be ignored in wchan output. */ 196/* Attach to any functions which should be ignored in wchan output. */
180#define __sched __attribute__((__section__(".sched.text"))) 197#define __sched __attribute__((__section__(".sched.text")))
181/* Is this address in the __sched functions? */ 198/* Is this address in the __sched functions? */
diff --git a/init/main.c b/init/main.c
index ff410063e4e1..a29fb2ac7240 100644
--- a/init/main.c
+++ b/init/main.c
@@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
614 migration_init(); 614 migration_init();
615#endif 615#endif
616 spawn_ksoftirqd(); 616 spawn_ksoftirqd();
617 spawn_softlockup_task();
617} 618}
618 619
619static void run_init_process(char *init_filename) 620static void run_init_process(char *init_filename)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..8d57a2f1226b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
28obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
30obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 31obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 32obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
32obj-$(CONFIG_SECCOMP) += seccomp.o 33obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1059,6 +1059,7 @@ int swsusp_resume(void)
1059 BUG_ON(!error); 1059 BUG_ON(!error);
1060 restore_processor_state(); 1060 restore_processor_state();
1061 restore_highmem(); 1061 restore_highmem();
1062 touch_softlockup_watchdog();
1062 device_power_up(); 1063 device_power_up();
1063 local_irq_enable(); 1064 local_irq_enable();
1064 return error; 1065 return error;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
6 * this code detects soft lockups: incidents in where on a CPU
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..1433d87f46b3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 299f7f3b5b08..3754c9a8f5c8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
46 13 => 8 KB 46 13 => 8 KB
47 12 => 4 KB 47 12 => 4 KB
48 48
49config DETECT_SOFTLOCKUP
50 bool "Detect Soft Lockups"
51 depends on DEBUG_KERNEL
52 default y
53 help
54 Say Y here to enable the kernel to detect "soft lockups",
55 which are bugs that cause the kernel to loop in kernel
56 mode for more than 10 seconds, without giving other tasks a
57 chance to run.
58
59 When a soft-lockup is detected, the kernel will print the
60 current stack trace (which you should report), but the
61 system will stay locked up. This feature has negligible
62 overhead.
63
64 (Note that "hard lockups" are separate type of bugs that
65 can be detected via the NMI-watchdog, on platforms that
66 support it.)
67
49config SCHEDSTATS 68config SCHEDSTATS
50 bool "Collect scheduler statistics" 69 bool "Collect scheduler statistics"
51 depends on DEBUG_KERNEL && PROC_FS 70 depends on DEBUG_KERNEL && PROC_FS