aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul E. McKenney <paulmck@linux.vnet.ibm.com>2008-08-10 21:35:38 -0400
committerIngo Molnar <mingo@elte.hu>2008-08-11 07:35:18 -0400
commit67182ae1c42206e516f7efb292b745e826497b24 (patch)
treed2d402550a0432489090264df95a8154597dc989
parentc4c0c56a7a85ed5725786219e4fbca7e840b1531 (diff)
rcu, debug: detect stalled grace periods
This is a diagnostic patch for Classic RCU. The approach is to record a timestamp at the beginning of the grace period (in rcu_start_batch()), then have rcu_check_callbacks() complain if: 1. it is running on a CPU that has been holding up grace periods for a long time (say one second). This will identify the culprit assuming that the culprit has not disabled hardware irqs, instruction execution, or some such. 2. it is running on a CPU that is not holding up grace periods, but grace periods have been held up for an even longer time (say two seconds). It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel configuration option. Rather than exponential backoff, it backs off to once per 30 seconds. My feeling upon thinking on it was that if you have stalled RCU grace periods for that long, a few extra printk() messages are probably the least of your worries... Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Cc: David Witbrodt <dawitbro@sbcglobal.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--include/linux/rcuclassic.h3
-rw-r--r--kernel/rcuclassic.c80
-rw-r--r--lib/Kconfig.debug13
3 files changed, 96 insertions, 0 deletions
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 04c728147be0..16589958b40e 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -46,6 +46,9 @@ struct rcu_ctrlblk {
46 long cur; /* Current batch number. */ 46 long cur; /* Current batch number. */
47 long completed; /* Number of the last completed batch */ 47 long completed; /* Number of the last completed batch */
48 long pending; /* Number of the last pending batch */ 48 long pending; /* Number of the last pending batch */
49#ifdef CONFIG_DEBUG_RCU_STALL
50 unsigned long gp_check; /* Time grace period should end, in seconds. */
51#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
49 52
50 int signaled; 53 int signaled;
51 54
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index d4271146a9bd..d7ec731de75c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp)
286 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 287 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
287 * period (if necessary). 288 * period (if necessary).
288 */ 289 */
290
291#ifdef CONFIG_DEBUG_RCU_STALL
292
293static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
294{
295 rcp->gp_check = get_seconds() + 3;
296}
297static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
298{
299 int cpu;
300 long delta;
301
302 /* Only let one CPU complain about others per time interval. */
303
304 spin_lock(&rcp->lock);
305 delta = get_seconds() - rcp->gp_check;
306 if (delta < 2L ||
307 cpus_empty(rcp->cpumask)) {
308 spin_unlock(&rcp->lock);
309 return;
310 rcp->gp_check = get_seconds() + 30;
311 }
312 spin_unlock(&rcp->lock);
313
314 /* OK, time to rat on our buddy... */
315
316 printk(KERN_ERR "RCU detected CPU stalls:");
317 for_each_cpu_mask(cpu, rcp->cpumask)
318 printk(" %d", cpu);
319 printk(" (detected by %d, t=%lu/%lu)\n",
320 smp_processor_id(), get_seconds(), rcp->gp_check);
321}
322static void print_cpu_stall(struct rcu_ctrlblk *rcp)
323{
324 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
325 smp_processor_id(), get_seconds(), rcp->gp_check);
326 dump_stack();
327 spin_lock(&rcp->lock);
328 if ((long)(get_seconds() - rcp->gp_check) >= 0L)
329 rcp->gp_check = get_seconds() + 30;
330 spin_unlock(&rcp->lock);
331}
332static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
333 struct rcu_data *rdp)
334{
335 long delta;
336
337 delta = get_seconds() - rcp->gp_check;
338 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
339
340 /* We haven't checked in, so go dump stack. */
341
342 print_cpu_stall(rcp);
343
344 } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
345
346 /* They had two seconds to dump stack, so complain. */
347
348 print_other_cpu_stall(rcp);
349
350 }
351}
352
353#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
354
/* Stall detection is compiled out: nothing to record. */
static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { }
/* Stall detection is compiled out: nothing to check. */
static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
				   struct rcu_data *rdp) { }
362
363#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
364
289/* 365/*
290 * Register a new batch of callbacks, and start it up if there is currently no 366 * Register a new batch of callbacks, and start it up if there is currently no
291 * active batch and the batch to be registered has not already occurred. 367 * active batch and the batch to be registered has not already occurred.
@@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
296 if (rcp->cur != rcp->pending && 372 if (rcp->cur != rcp->pending &&
297 rcp->completed == rcp->cur) { 373 rcp->completed == rcp->cur) {
298 rcp->cur++; 374 rcp->cur++;
375 record_gp_check_time(rcp);
299 376
300 /* 377 /*
301 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 378 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
489 566
490static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 567static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
491{ 568{
569 /* Check for CPU stalls, if enabled. */
570 check_cpu_stall(rcp, rdp);
571
492 if (rdp->nxtlist) { 572 if (rdp->nxtlist) {
493 /* 573 /*
494 * This cpu has pending rcu entries and the grace period 574 * This cpu has pending rcu entries and the grace period
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e1d4764435ed..2fb6d90bf1e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
597 Say N here if you want the RCU torture tests to start only 597 Say N here if you want the RCU torture tests to start only
598 after being manually enabled via /proc. 598 after being manually enabled via /proc.
599 599
# The symbol name must match the CONFIG_DEBUG_RCU_STALL guards used in
# include/linux/rcuclassic.h and kernel/rcuclassic.c.
config DEBUG_RCU_STALL
	bool "Check for stalled CPUs delaying RCU grace periods"
	depends on CLASSIC_RCU
	default n
	help
	  This option causes RCU to printk information on which
	  CPUs are delaying the current grace period, but only when
	  the grace period extends for excessive time periods.

	  Say Y if you want RCU to perform such checks.

	  Say N if you are unsure.
611 Say N if you are unsure.
612
600config KPROBES_SANITY_TEST 613config KPROBES_SANITY_TEST
601 bool "Kprobes sanity tests" 614 bool "Kprobes sanity tests"
602 depends on DEBUG_KERNEL 615 depends on DEBUG_KERNEL