aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2009-10-15 12:26:14 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-10-15 14:33:01 -0400
commit: 237c80c5c8fb7ec128cf2a756b550dc41ad7eac7 (patch)
tree: 5d6b3346f2c53cd3f7471001479a8dbd741533a3 /kernel
parent: 019129d595caaa5bd0b41d128308da1be6a91869 (diff)
rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang
If the following sequence of events occurs, then TREE_PREEMPT_RCU will hang waiting for a grace period to complete, eventually OOMing the system:

o A TREE_PREEMPT_RCU build of the kernel is booted on a system with more than 64 physical CPUs present (32 on a 32-bit system). Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted with RCU_FANOUT set to a sufficiently small value that the physical CPUs populate two or more leaf rcu_node structures.

o A task is preempted in an RCU read-side critical section while running on a CPU corresponding to a given leaf rcu_node structure.

o All CPUs corresponding to this same leaf rcu_node structure record quiescent states for the current grace period.

o All of these same CPUs go offline (hence the need for enough physical CPUs to populate more than one leaf rcu_node structure). This causes the preempted task to be moved to the root rcu_node structure.

At this point, there is nothing left to cause the quiescent state to be propagated up the rcu_node tree, so the current grace period never completes.

The simplest fix, especially after considering the deadlock possibilities, is to detect this situation when the last CPU is offlined, and to set that CPU's ->qsmask bit in its leaf rcu_node structure. This will cause the next invocation of force_quiescent_state() to end the grace period.

Without this fix, this hang can be triggered in an hour or so on some machines with rcutorture and random CPU onlining/offlining. With this fix, these same machines pass a full 10 hours of this sort of abuse.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/rcutree.c 15
-rw-r--r-- kernel/rcutree.h 6
-rw-r--r-- kernel/rcutree_plugin.h 25
3 files changed, 34 insertions, 12 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ddbf111e9e18..0536125b0497 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
913 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 913 spin_unlock(&rnp->lock); /* irqs remain disabled. */
914 break; 914 break;
915 } 915 }
916 rcu_preempt_offline_tasks(rsp, rnp, rdp); 916
917 /*
918 * If there was a task blocking the current grace period,
919 * and if all CPUs have checked in, we need to propagate
920 * the quiescent state up the rcu_node hierarchy. But that
921 * is inconvenient at the moment due to deadlock issues if
922 * this should end the current grace period. So set the
923 * offlined CPU's bit in ->qsmask in order to force the
924 * next force_quiescent_state() invocation to clean up this
925 * mess in a deadlock-free manner.
926 */
927 if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
928 rnp->qsmask |= mask;
929
917 mask = rnp->grpmask; 930 mask = rnp->grpmask;
918 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 931 spin_unlock(&rnp->lock); /* irqs remain disabled. */
919 rnp = rnp->parent; 932 rnp = rnp->parent;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 599161f309fb..1823c6e20609 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -306,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
306#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 306#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
307static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 307static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
308#ifdef CONFIG_HOTPLUG_CPU 308#ifdef CONFIG_HOTPLUG_CPU
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 309static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp, 310 struct rcu_node *rnp,
311 struct rcu_data *rdp); 311 struct rcu_data *rdp);
312static void rcu_preempt_offline_cpu(int cpu); 312static void rcu_preempt_offline_cpu(int cpu);
313#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 313#endif /* #ifdef CONFIG_HOTPLUG_CPU */
314static void rcu_preempt_check_callbacks(int cpu); 314static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ebd20ee7707d..ef2a58c2b9d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
304 * parent is to remove the need for rcu_read_unlock_special() to 304 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 305 * make more than two attempts to acquire the target rcu_node's lock.
306 * 306 *
307 * Returns 1 if there was previously a task blocking the current grace
308 * period on the specified rcu_node structure.
309 *
307 * The caller must hold rnp->lock with irqs disabled. 310 * The caller must hold rnp->lock with irqs disabled.
308 */ 311 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 312static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp, 313 struct rcu_node *rnp,
311 struct rcu_data *rdp) 314 struct rcu_data *rdp)
312{ 315{
313 int i; 316 int i;
314 struct list_head *lp; 317 struct list_head *lp;
315 struct list_head *lp_root; 318 struct list_head *lp_root;
319 int retval = rcu_preempted_readers(rnp);
316 struct rcu_node *rnp_root = rcu_get_root(rsp); 320 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp; 321 struct task_struct *tp;
318 322
319 if (rnp == rnp_root) { 323 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?"); 324 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */ 325 return 0; /* Shouldn't happen: at least one CPU online. */
322 } 326 }
323 WARN_ON_ONCE(rnp != rdp->mynode && 327 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) || 328 (!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 346 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 } 347 }
344 } 348 }
349
350 return retval;
345} 351}
346 352
347/* 353/*
@@ -532,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
532 538
533/* 539/*
534 * Because preemptable RCU does not exist, it never needs to migrate 540 * Because preemptable RCU does not exist, it never needs to migrate
535 * tasks that were blocked within RCU read-side critical sections. 541 * tasks that were blocked within RCU read-side critical sections, and
542 * such non-existent tasks cannot possibly have been blocking the current
543 * grace period.
536 */ 544 */
537static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 545static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
538 struct rcu_node *rnp, 546 struct rcu_node *rnp,
539 struct rcu_data *rdp) 547 struct rcu_data *rdp)
540{ 548{
549 return 0;
541} 550}
542 551
543/* 552/*