about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2008-10-02 19:06:39 -0400
committer Ingo Molnar <mingo@elte.hu> 2008-10-03 04:36:08 -0400
commit 2133b5d7ff531bc15a923db4a6a50bf96c561be9 (patch)
tree 5917515eaec573fbc3d4a734769d6184beb83dbb
parent b5259d944279d0b7e78a83849a352d8ba0447c4c (diff)
rcu: RCU-based detection of stalled CPUs for Classic RCU
This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults to disabled.

This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about.

This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4).

The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- include/linux/rcuclassic.h | 12
-rw-r--r-- kernel/rcuclassic.c | 166
-rw-r--r-- lib/Kconfig.debug | 2
3 files changed, 96 insertions, 84 deletions
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 29bf528c7dcc..5f89b62e6983 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -40,15 +40,21 @@
40#include <linux/cpumask.h> 40#include <linux/cpumask.h>
41#include <linux/seqlock.h> 41#include <linux/seqlock.h>
42 42
43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
44#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */
45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
43 47
44/* Global control variables for rcupdate callback mechanism. */ 48/* Global control variables for rcupdate callback mechanism. */
45struct rcu_ctrlblk { 49struct rcu_ctrlblk {
46 long cur; /* Current batch number. */ 50 long cur; /* Current batch number. */
47 long completed; /* Number of the last completed batch */ 51 long completed; /* Number of the last completed batch */
48 long pending; /* Number of the last pending batch */ 52 long pending; /* Number of the last pending batch */
49#ifdef CONFIG_DEBUG_RCU_STALL 53#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
50 unsigned long gp_check; /* Time grace period should end, in seconds. */ 54 unsigned long gp_start; /* Time at which GP started in jiffies. */
51#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ 55 unsigned long jiffies_stall;
56 /* Time at which to check for CPU stalls. */
57#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
52 58
53 int signaled; 59 int signaled;
54 60
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ed15128ca2c9..0d07e6e51578 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
164 } 164 }
165} 165}
166 166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two seconds to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
247
167/** 248/**
168 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
169 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
293 * period (if necessary). 374 * period (if necessary).
294 */ 375 */
295 376
296#ifdef CONFIG_DEBUG_RCU_STALL
297
298static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
299{
300 rcp->gp_check = get_seconds() + 3;
301}
302
303static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
304{
305 int cpu;
306 long delta;
307 unsigned long flags;
308
309 /* Only let one CPU complain about others per time interval. */
310
311 spin_lock_irqsave(&rcp->lock, flags);
312 delta = get_seconds() - rcp->gp_check;
313 if (delta < 2L || cpus_empty(rcp->cpumask)) {
314 spin_unlock(&rcp->lock);
315 return;
316 }
317 rcp->gp_check = get_seconds() + 30;
318 spin_unlock_irqrestore(&rcp->lock, flags);
319
320 /* OK, time to rat on our buddy... */
321
322 printk(KERN_ERR "RCU detected CPU stalls:");
323 for_each_cpu_mask(cpu, rcp->cpumask)
324 printk(" %d", cpu);
325 printk(" (detected by %d, t=%lu/%lu)\n",
326 smp_processor_id(), get_seconds(), rcp->gp_check);
327}
328
329static void print_cpu_stall(struct rcu_ctrlblk *rcp)
330{
331 unsigned long flags;
332
333 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
334 smp_processor_id(), get_seconds(), rcp->gp_check);
335 dump_stack();
336 spin_lock_irqsave(&rcp->lock, flags);
337 if ((long)(get_seconds() - rcp->gp_check) >= 0L)
338 rcp->gp_check = get_seconds() + 30;
339 spin_unlock_irqrestore(&rcp->lock, flags);
340}
341
342static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
343{
344 long delta;
345
346 delta = get_seconds() - rcp->gp_check;
347 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
348
349 /* We haven't checked in, so go dump stack. */
350
351 print_cpu_stall(rcp);
352
353 } else {
354 if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
355 /* They had two seconds to dump stack, so complain. */
356 print_other_cpu_stall(rcp);
357 }
358 }
359}
360
361#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
362
363static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
364{
365}
366
367static inline void
368check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
369{
370}
371
372#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
373
374/* 377/*
375 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
376 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
381 if (rcp->cur != rcp->pending && 384 if (rcp->cur != rcp->pending &&
382 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
383 rcp->cur++; 386 rcp->cur++;
384 record_gp_check_time(rcp); 387 record_gp_stall_check_time(rcp);
385 388
386 /* 389 /*
387 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
603static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
604{ 607{
605 /* Check for CPU stalls, if enabled. */ 608 /* Check for CPU stalls, if enabled. */
606 check_cpu_stall(rcp, rdp); 609 check_cpu_stall(rcp);
607 610
608 if (rdp->nxtlist) { 611 if (rdp->nxtlist) {
609 long completed_snap = ACCESS_ONCE(rcp->completed); 612 long completed_snap = ACCESS_ONCE(rcp->completed);
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
769 */ 772 */
770void __init __rcu_init(void) 773void __init __rcu_init(void)
771{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
772 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
773 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
774 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ccede1aeab38..9fee969dd60e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE
597 Say N here if you want the RCU torture tests to start only 597 Say N here if you want the RCU torture tests to start only
598 after being manually enabled via /proc. 598 after being manually enabled via /proc.
599 599
600config RCU_CPU_STALL 600config RCU_CPU_STALL_DETECTOR
601 bool "Check for stalled CPUs delaying RCU grace periods" 601 bool "Check for stalled CPUs delaying RCU grace periods"
602 depends on CLASSIC_RCU 602 depends on CLASSIC_RCU
603 default n 603 default n