aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul E. McKenney <paulmck@linux.vnet.ibm.com>2013-03-06 16:37:09 -0500
committerPaul E. McKenney <paulmck@linux.vnet.ibm.com>2013-03-13 17:43:56 -0400
commit6231069bdab575fce862ca786f1c0ba5e4e9ba3b (patch)
tree95c88dfa505c5ae3b27df9bcffb07edf3dae511f
parent49d0de082c31de34cc896c14eec5f1c2ade0415a (diff)
rcu: Add softirq-stall indications to stall-warning messages
If RCU's softirq handler is prevented from executing, an RCU CPU stall warning can result. Ways to prevent RCU's softirq handler from executing include: (1) CPU spinning with interrupts disabled, (2) infinite loop in some softirq handler, and (3) in -rt kernels, an infinite loop in a set of real-time threads running at priorities higher than that of RCU's softirq handler. Because this situation can be difficult to track down, this commit causes the count of RCU softirq handler invocations to be printed with RCU CPU stall warnings. This information does require some interpretation, as now documented in Documentation/RCU/stallwarn.txt. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Paul Gortmaker <paul.gortmaker@windriver.com>
-rw-r--r--Documentation/RCU/stallwarn.txt33
-rw-r--r--kernel/rcutree.h5
-rw-r--r--kernel/rcutree_plugin.h4
3 files changed, 32 insertions, 10 deletions
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1927151b386b..e38b8df3d727 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -92,14 +92,14 @@ If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set,
92more information is printed with the stall-warning message, for example: 92more information is printed with the stall-warning message, for example:
93 93
94 INFO: rcu_preempt detected stall on CPU 94 INFO: rcu_preempt detected stall on CPU
95 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 95 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
96 (t=65000 jiffies) 96 (t=65000 jiffies)
97 97
98In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is 98In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
99printed: 99printed:
100 100
101 INFO: rcu_preempt detected stall on CPU 101 INFO: rcu_preempt detected stall on CPU
102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending 102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
103 (t=65000 jiffies) 103 (t=65000 jiffies)
104 104
105The "(64628 ticks this GP)" indicates that this CPU has taken more 105The "(64628 ticks this GP)" indicates that this CPU has taken more
@@ -116,13 +116,28 @@ number between the two "/"s is the value of the nesting, which will
116be a small positive number if in the idle loop and a very large positive 116be a small positive number if in the idle loop and a very large positive
117number (as shown above) otherwise. 117number (as shown above) otherwise.
118 118
119For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is 119The "softirq=" portion of the message tracks the number of RCU softirq
120not in the process of trying to force itself into dyntick-idle state, the 120handlers that the stalled CPU has executed. The number before the "/"
121"." indicates that the CPU has not given up forcing RCU into dyntick-idle 121is the number that had executed since boot at the time that this CPU
122mode (it would be "H" otherwise), and the "timer not pending" indicates 122last noted the beginning of a grace period, which might be the current
123that the CPU has not recently forced RCU into dyntick-idle mode (it 123(stalled) grace period, or it might be some earlier grace period (for
124would otherwise indicate the number of microseconds remaining in this 124example, if the CPU might have been in dyntick-idle mode for an extended
125forced state). 125time period. The number after the "/" is the number that have executed
126since boot until the current time. If this latter number stays constant
127across repeated stall-warning messages, it is possible that RCU's softirq
128handlers are no longer able to execute on this CPU. This can happen if
129the stalled CPU is spinning with interrupts are disabled, or, in -rt
130kernels, if a high-priority process is starving RCU's softirq handler.
131
132For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
133low-order 16 bits (in hex) of the jiffies counter when this CPU last
134invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
135rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:"
136prints the number of non-lazy callbacks posted since the last call to
137rcu_needs_cpu(). Finally, an "L" indicates that there are currently
138no non-lazy callbacks ("." is printed otherwise, as shown above) and
139"D" indicates that dyntick-idle processing is enabled ("." is printed
140otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
126 141
127 142
128Multiple Warnings From One Stall 143Multiple Warnings From One Stall
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8e756099a1a8..26c9bb3166a3 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -325,6 +325,11 @@ struct rcu_data {
325 struct task_struct *nocb_kthread; 325 struct task_struct *nocb_kthread;
326#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 326#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
327 327
328 /* 8) RCU CPU stall data. */
329#ifdef CONFIG_RCU_CPU_STALL_INFO
330 unsigned int softirq_snap; /* Snapshot of softirq activity. */
331#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
332
328 int cpu; 333 int cpu;
329 struct rcu_state *rsp; 334 struct rcu_state *rsp;
330}; 335};
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..7fcd3bbf67da 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -2070,10 +2070,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2070 ticks_value = rsp->gpnum - rdp->gpnum; 2070 ticks_value = rsp->gpnum - rdp->gpnum;
2071 } 2071 }
2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", 2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 cpu, ticks_value, ticks_title, 2074 cpu, ticks_value, ticks_title,
2075 atomic_read(&rdtp->dynticks) & 0xfff, 2075 atomic_read(&rdtp->dynticks) & 0xfff,
2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2077 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 fast_no_hz); 2078 fast_no_hz);
2078} 2079}
2079 2080
@@ -2087,6 +2088,7 @@ static void print_cpu_stall_info_end(void)
2087static void zero_cpu_stall_ticks(struct rcu_data *rdp) 2088static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088{ 2089{
2089 rdp->ticks_this_gp = 0; 2090 rdp->ticks_this_gp = 0;
2091 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090} 2092}
2091 2093
2092/* Increment ->ticks_this_gp for all flavors of RCU. */ 2094/* Increment ->ticks_this_gp for all flavors of RCU. */