diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2008-10-02 19:06:39 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-03 04:36:08 -0400 |
commit | 2133b5d7ff531bc15a923db4a6a50bf96c561be9 (patch) | |
tree | 5917515eaec573fbc3d4a734769d6184beb83dbb | |
parent | b5259d944279d0b7e78a83849a352d8ba0447c4c (diff) |
rcu: RCU-based detection of stalled CPUs for Classic RCU
This patch adds stalled-CPU detection to Classic RCU. This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
is disabled by default.
This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.
This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds. This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1c4).
The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | include/linux/rcuclassic.h | 12 | ||||
-rw-r--r-- | kernel/rcuclassic.c | 166 | ||||
-rw-r--r-- | lib/Kconfig.debug | 2 |
3 files changed, 96 insertions, 84 deletions
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 29bf528c7dcc..5f89b62e6983 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h | |||
@@ -40,15 +40,21 @@ | |||
40 | #include <linux/cpumask.h> | 40 | #include <linux/cpumask.h> |
41 | #include <linux/seqlock.h> | 41 | #include <linux/seqlock.h> |
42 | 42 | ||
43 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
44 | #define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ | ||
45 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ | ||
46 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
43 | 47 | ||
44 | /* Global control variables for rcupdate callback mechanism. */ | 48 | /* Global control variables for rcupdate callback mechanism. */ |
45 | struct rcu_ctrlblk { | 49 | struct rcu_ctrlblk { |
46 | long cur; /* Current batch number. */ | 50 | long cur; /* Current batch number. */ |
47 | long completed; /* Number of the last completed batch */ | 51 | long completed; /* Number of the last completed batch */ |
48 | long pending; /* Number of the last pending batch */ | 52 | long pending; /* Number of the last pending batch */ |
49 | #ifdef CONFIG_DEBUG_RCU_STALL | 53 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
50 | unsigned long gp_check; /* Time grace period should end, in seconds. */ | 54 | unsigned long gp_start; /* Time at which GP started in jiffies. */ |
51 | #endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ | 55 | unsigned long jiffies_stall; |
56 | /* Time at which to check for CPU stalls. */ | ||
57 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
52 | 58 | ||
53 | int signaled; | 59 | int signaled; |
54 | 60 | ||
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128ca2c9..0d07e6e51578 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c | |||
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, | |||
164 | } | 164 | } |
165 | } | 165 | } |
166 | 166 | ||
167 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
168 | |||
169 | static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) | ||
170 | { | ||
171 | rcp->gp_start = jiffies; | ||
172 | rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; | ||
173 | } | ||
174 | |||
175 | static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) | ||
176 | { | ||
177 | int cpu; | ||
178 | long delta; | ||
179 | unsigned long flags; | ||
180 | |||
181 | /* Only let one CPU complain about others per time interval. */ | ||
182 | |||
183 | spin_lock_irqsave(&rcp->lock, flags); | ||
184 | delta = jiffies - rcp->jiffies_stall; | ||
185 | if (delta < 2 || rcp->cur != rcp->completed) { | ||
186 | spin_unlock_irqrestore(&rcp->lock, flags); | ||
187 | return; | ||
188 | } | ||
189 | rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | ||
190 | spin_unlock_irqrestore(&rcp->lock, flags); | ||
191 | |||
192 | /* OK, time to rat on our buddy... */ | ||
193 | |||
194 | printk(KERN_ERR "RCU detected CPU stalls:"); | ||
195 | for_each_possible_cpu(cpu) { | ||
196 | if (cpu_isset(cpu, rcp->cpumask)) | ||
197 | printk(" %d", cpu); | ||
198 | } | ||
199 | printk(" (detected by %d, t=%ld jiffies)\n", | ||
200 | smp_processor_id(), (long)(jiffies - rcp->gp_start)); | ||
201 | } | ||
202 | |||
203 | static void print_cpu_stall(struct rcu_ctrlblk *rcp) | ||
204 | { | ||
205 | unsigned long flags; | ||
206 | |||
207 | printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", | ||
208 | smp_processor_id(), jiffies, | ||
209 | jiffies - rcp->gp_start); | ||
210 | dump_stack(); | ||
211 | spin_lock_irqsave(&rcp->lock, flags); | ||
212 | if ((long)(jiffies - rcp->jiffies_stall) >= 0) | ||
213 | rcp->jiffies_stall = | ||
214 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | ||
215 | spin_unlock_irqrestore(&rcp->lock, flags); | ||
216 | set_need_resched(); /* kick ourselves to get things going. */ | ||
217 | } | ||
218 | |||
219 | static void check_cpu_stall(struct rcu_ctrlblk *rcp) | ||
220 | { | ||
221 | long delta; | ||
222 | |||
223 | delta = jiffies - rcp->jiffies_stall; | ||
224 | if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { | ||
225 | |||
226 | /* We haven't checked in, so go dump stack. */ | ||
227 | print_cpu_stall(rcp); | ||
228 | |||
229 | } else if (rcp->cur != rcp->completed && delta >= 2) { | ||
230 | |||
231 | /* They had two seconds to dump stack, so complain. */ | ||
232 | print_other_cpu_stall(rcp); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
237 | |||
238 | static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) | ||
239 | { | ||
240 | } | ||
241 | |||
242 | static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
243 | { | ||
244 | } | ||
245 | |||
246 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
247 | |||
167 | /** | 248 | /** |
168 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 249 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
169 | * @head: structure to be used for queueing the RCU updates. | 250 | * @head: structure to be used for queueing the RCU updates. |
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
293 | * period (if necessary). | 374 | * period (if necessary). |
294 | */ | 375 | */ |
295 | 376 | ||
296 | #ifdef CONFIG_DEBUG_RCU_STALL | ||
297 | |||
298 | static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) | ||
299 | { | ||
300 | rcp->gp_check = get_seconds() + 3; | ||
301 | } | ||
302 | |||
303 | static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) | ||
304 | { | ||
305 | int cpu; | ||
306 | long delta; | ||
307 | unsigned long flags; | ||
308 | |||
309 | /* Only let one CPU complain about others per time interval. */ | ||
310 | |||
311 | spin_lock_irqsave(&rcp->lock, flags); | ||
312 | delta = get_seconds() - rcp->gp_check; | ||
313 | if (delta < 2L || cpus_empty(rcp->cpumask)) { | ||
314 | spin_unlock(&rcp->lock); | ||
315 | return; | ||
316 | } | ||
317 | rcp->gp_check = get_seconds() + 30; | ||
318 | spin_unlock_irqrestore(&rcp->lock, flags); | ||
319 | |||
320 | /* OK, time to rat on our buddy... */ | ||
321 | |||
322 | printk(KERN_ERR "RCU detected CPU stalls:"); | ||
323 | for_each_cpu_mask(cpu, rcp->cpumask) | ||
324 | printk(" %d", cpu); | ||
325 | printk(" (detected by %d, t=%lu/%lu)\n", | ||
326 | smp_processor_id(), get_seconds(), rcp->gp_check); | ||
327 | } | ||
328 | |||
329 | static void print_cpu_stall(struct rcu_ctrlblk *rcp) | ||
330 | { | ||
331 | unsigned long flags; | ||
332 | |||
333 | printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", | ||
334 | smp_processor_id(), get_seconds(), rcp->gp_check); | ||
335 | dump_stack(); | ||
336 | spin_lock_irqsave(&rcp->lock, flags); | ||
337 | if ((long)(get_seconds() - rcp->gp_check) >= 0L) | ||
338 | rcp->gp_check = get_seconds() + 30; | ||
339 | spin_unlock_irqrestore(&rcp->lock, flags); | ||
340 | } | ||
341 | |||
342 | static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
343 | { | ||
344 | long delta; | ||
345 | |||
346 | delta = get_seconds() - rcp->gp_check; | ||
347 | if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { | ||
348 | |||
349 | /* We haven't checked in, so go dump stack. */ | ||
350 | |||
351 | print_cpu_stall(rcp); | ||
352 | |||
353 | } else { | ||
354 | if (!cpus_empty(rcp->cpumask) && delta >= 2L) { | ||
355 | /* They had two seconds to dump stack, so complain. */ | ||
356 | print_other_cpu_stall(rcp); | ||
357 | } | ||
358 | } | ||
359 | } | ||
360 | |||
361 | #else /* #ifdef CONFIG_DEBUG_RCU_STALL */ | ||
362 | |||
363 | static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) | ||
364 | { | ||
365 | } | ||
366 | |||
367 | static inline void | ||
368 | check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
369 | { | ||
370 | } | ||
371 | |||
372 | #endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ | ||
373 | |||
374 | /* | 377 | /* |
375 | * Register a new batch of callbacks, and start it up if there is currently no | 378 | * Register a new batch of callbacks, and start it up if there is currently no |
376 | * active batch and the batch to be registered has not already occurred. | 379 | * active batch and the batch to be registered has not already occurred. |
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) | |||
381 | if (rcp->cur != rcp->pending && | 384 | if (rcp->cur != rcp->pending && |
382 | rcp->completed == rcp->cur) { | 385 | rcp->completed == rcp->cur) { |
383 | rcp->cur++; | 386 | rcp->cur++; |
384 | record_gp_check_time(rcp); | 387 | record_gp_stall_check_time(rcp); |
385 | 388 | ||
386 | /* | 389 | /* |
387 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | 390 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a |
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
603 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | 606 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) |
604 | { | 607 | { |
605 | /* Check for CPU stalls, if enabled. */ | 608 | /* Check for CPU stalls, if enabled. */ |
606 | check_cpu_stall(rcp, rdp); | 609 | check_cpu_stall(rcp); |
607 | 610 | ||
608 | if (rdp->nxtlist) { | 611 | if (rdp->nxtlist) { |
609 | long completed_snap = ACCESS_ONCE(rcp->completed); | 612 | long completed_snap = ACCESS_ONCE(rcp->completed); |
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { | |||
769 | */ | 772 | */ |
770 | void __init __rcu_init(void) | 773 | void __init __rcu_init(void) |
771 | { | 774 | { |
775 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
776 | printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); | ||
777 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
772 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 778 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, |
773 | (void *)(long)smp_processor_id()); | 779 | (void *)(long)smp_processor_id()); |
774 | /* Register notifier for non-boot CPUs */ | 780 | /* Register notifier for non-boot CPUs */ |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ccede1aeab38..9fee969dd60e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE | |||
597 | Say N here if you want the RCU torture tests to start only | 597 | Say N here if you want the RCU torture tests to start only |
598 | after being manually enabled via /proc. | 598 | after being manually enabled via /proc. |
599 | 599 | ||
600 | config RCU_CPU_STALL | 600 | config RCU_CPU_STALL_DETECTOR |
601 | bool "Check for stalled CPUs delaying RCU grace periods" | 601 | bool "Check for stalled CPUs delaying RCU grace periods" |
602 | depends on CLASSIC_RCU | 602 | depends on CLASSIC_RCU |
603 | default n | 603 | default n |