author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2008-10-02 19:06:39 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-03 04:36:08 -0400
commit		2133b5d7ff531bc15a923db4a6a50bf96c561be9 (patch)
tree		5917515eaec573fbc3d4a734769d6184beb83dbb
parent		b5259d944279d0b7e78a83849a352d8ba0447c4c (diff)
rcu: RCU-based detection of stalled CPUs for Classic RCU
This patch adds stalled-CPU detection to Classic RCU.  This capability
is enabled by a new config variable, CONFIG_RCU_CPU_STALL_DETECTOR,
which is disabled by default.
This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.
This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds. This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1c4).
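As a rough illustration (hypothetical code, not part of this patch), the
class of bug the detector flags is a CPU spinning with preemption
disabled: such a CPU never passes through a quiescent state, so the
current grace period cannot complete and, once RCU_SECONDS_TILL_STALL_CHECK
jiffies elapse, the stall warning fires.

#include <linux/compiler.h>	/* ACCESS_ONCE() */
#include <linux/preempt.h>	/* preempt_disable()/preempt_enable() */
#include <asm/processor.h>	/* cpu_relax() */

static int flag_never_set;	/* nothing ever sets this in the buggy scenario */

/* Buggy wait loop: with preemption disabled, this CPU never reaches a
 * quiescent state, so the stall detector would eventually report
 * "RCU detected CPU ... stall" for it. */
static void buggy_busy_wait(void)
{
	preempt_disable();
	while (!ACCESS_ONCE(flag_never_set))
		cpu_relax();		/* spins forever */
	preempt_enable();		/* never reached */
}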
The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.
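The jiffies-based deadline is tested with a signed subtraction, the same
idiom used in print_cpu_stall() and check_cpu_stall() below; a minimal
stand-alone sketch (not taken from the patch) of that wrap-safe test:

#include <linux/jiffies.h>	/* jiffies */

/* Returns nonzero once the stall deadline has passed.  The signed
 * subtraction, rather than a direct "jiffies >= deadline" comparison,
 * keeps the test correct even when the jiffies counter wraps. */
static int stall_deadline_passed(unsigned long jiffies_stall)
{
	return (long)(jiffies - jiffies_stall) >= 0;
}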
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	include/linux/rcuclassic.h	|  12
-rw-r--r--	kernel/rcuclassic.c	| 166
-rw-r--r--	lib/Kconfig.debug	|   2
3 files changed, 96 insertions, 84 deletions
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 29bf528c7dcc..5f89b62e6983 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -40,15 +40,21 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK	( 3 * HZ)	/* for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ)	/* for rcp->jiffies_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
 	long	cur;		/* Current batch number. */
 	long	completed;	/* Number of the last completed batch */
 	long	pending;	/* Number of the last pending batch */
-#ifdef CONFIG_DEBUG_RCU_STALL
-	unsigned long gp_check;	/* Time grace period should end, in seconds. */
-#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;	/* Time at which GP started in jiffies. */
+	unsigned long jiffies_stall;
+				/* Time at which to check for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 	int	signaled;
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ed15128ca2c9..0d07e6e51578 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
 	}
 }
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+	rcp->gp_start = jiffies;
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rcp->lock, flags);
+	delta = jiffies - rcp->jiffies_stall;
+	if (delta < 2 || rcp->cur != rcp->completed) {
+		spin_unlock_irqrestore(&rcp->lock, flags);
+		return;
+	}
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_possible_cpu(cpu) {
+		if (cpu_isset(cpu, rcp->cpumask))
+			printk(" %d", cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+			smp_processor_id(), jiffies,
+			jiffies - rcp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rcp->lock, flags);
+	if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+		rcp->jiffies_stall =
+			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	long delta;
+
+	delta = jiffies - rcp->jiffies_stall;
+	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rcp);
+
+	} else if (rcp->cur != rcp->completed && delta >= 2) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rcp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
  * period (if necessary).
  */
 
-#ifdef CONFIG_DEBUG_RCU_STALL
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-	rcp->gp_check = get_seconds() + 3;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	long delta;
-	unsigned long flags;
-
-	/* Only let one CPU complain about others per time interval. */
-
-	spin_lock_irqsave(&rcp->lock, flags);
-	delta = get_seconds() - rcp->gp_check;
-	if (delta < 2L || cpus_empty(rcp->cpumask)) {
-		spin_unlock(&rcp->lock);
-		return;
-	}
-	rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-
-	/* OK, time to rat on our buddy... */
-
-	printk(KERN_ERR "RCU detected CPU stalls:");
-	for_each_cpu_mask(cpu, rcp->cpumask)
-		printk(" %d", cpu);
-	printk(" (detected by %d, t=%lu/%lu)\n",
-	       smp_processor_id(), get_seconds(), rcp->gp_check);
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	unsigned long flags;
-
-	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
-			smp_processor_id(), get_seconds(), rcp->gp_check);
-	dump_stack();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
-		rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	long delta;
-
-	delta = get_seconds() - rcp->gp_check;
-	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
-
-		/* We haven't checked in, so go dump stack. */
-
-		print_cpu_stall(rcp);
-
-	} else {
-		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
-			/* They had two seconds to dump stack, so complain. */
-			print_other_cpu_stall(rcp);
-		}
-	}
-}
-
-#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
-
 /*
  * Register a new batch of callbacks, and start it up if there is currently no
  * active batch and the batch to be registered has not already occurred.
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
 		rcp->cur++;
-		record_gp_check_time(rcp);
+		record_gp_stall_check_time(rcp);
 
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp, rdp);
+	check_cpu_stall(rcp);
 
 	if (rdp->nxtlist) {
 		long completed_snap = ACCESS_ONCE(rcp->completed);
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
  */
 void __init __rcu_init(void)
 {
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ccede1aeab38..9fee969dd60e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
-config RCU_CPU_STALL
+config RCU_CPU_STALL_DETECTOR
 	bool "Check for stalled CPUs delaying RCU grace periods"
 	depends on CLASSIC_RCU
 	default n
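For completeness, turning the renamed option on in a Classic RCU debug
build would amount to something like the following .config fragment
(illustrative only, not part of this patch); with it set, __rcu_init()
prints the "RCU-based detection of stalled CPUs is enabled." line at boot.

# illustrative .config fragment
CONFIG_CLASSIC_RCU=y
CONFIG_RCU_CPU_STALL_DETECTOR=y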
