rcu: Fix grace-period-stall bug on large systems with CPU hotplug

When the last CPU of a given leaf rcu_node structure goes offline, all of the tasks queued on that leaf rcu_node structure (due to having blocked in their current RCU read-side critical sections) are requeued onto the root rcu_node structure. This requeuing is carried out by rcu_preempt_offline_tasks(). However, it is possible that these queued tasks are the only thing preventing the leaf rcu_node structure from reporting a quiescent state up the rcu_node hierarchy. Unfortunately, the old code would fail to do this reporting, resulting in a grace-period stall given the following sequence of events: 1. Kernel built for more than 32 CPUs on 32-bit systems or for more than 64 CPUs on 64-bit systems, so that there is more than one rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set to a number smaller than CONFIG_NR_CPUS.) 2. The kernel is built with CONFIG_TREE_PREEMPT_RCU. 3. A task running on a CPU associated with a given leaf rcu_node structure blocks while in an RCU read-side critical section -and- that CPU has not yet passed through a quiescent state for the current RCU grace period. This will cause the task to be queued on the leaf rcu_node's blocked_tasks[] array, in particular, on the element of this array corresponding to the current grace period. 4. Each of the remaining CPUs corresponding to this same leaf rcu_node structure passes through a quiescent state. However, the task is still in its RCU read-side critical section, so these quiescent states cannot be reported further up the rcu_node hierarchy. Nevertheless, all bits in the leaf rcu_node structure's ->qsmask field are now zero. 5. Each of the remaining CPUs goes offline. (The events in steps #4 and #5 can happen in any order as long as each CPU passes through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke rcu_preempt_offline_tasks(), which will move the task to the root rcu_node structure, but without reporting a quiescent state up the rcu_node hierarchy (and this failure to report a quiescent state is the bug). But because this leaf rcu_node structure's ->qsmask field is already zero and its ->blocked_tasks[] entries are all empty, force_quiescent_state() will skip this rcu_node structure. Therefore, grace periods are now hung. This patch abstracts some code out of rcu_read_unlock_special(), calling the result task_quiet() by analogy with cpu_quiet(), and invokes task_quiet() from both rcu_read_unlock_special() and __rcu_offline_cpu(). Invoking task_quiet() from __rcu_offline_cpu() reports the quiescent state up the rcu_node hierarchy, fixing the bug. This ends up requiring a separate lock_class_key per level of the rcu_node hierarchy, which this patch also provides. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12589088301770-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2009-11-22 11:53:48 -0500
committer: Ingo Molnar <mingo@elte.hu> 2009-11-22 12:58:15 -0500
commit: b668c9cf3e58739dac54a1d6f42f2b4bdd980b3e (patch)
tree: c0165d39532a2314f8187d765f3c8ddf88b72831 /kernel/rcutree.c
parent: 2f51f9884f6a36b0fe9636d5a1937e5cbd25723b (diff)
1 files changed, 21 insertions, 19 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9b36d6d7fb97..b79bfcd28e95 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -51,7 +51,7 @@
 /* Data structures. */
-static struct lock_class_key rcu_root_class;
+static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 #define RCU_STATE_INITIALIZER(name) { \
        .level = { &name.node[0] }, \
@@ -936,6 +936,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
        unsigned long mask;
+        int need_quiet = 0;
        struct rcu_data *rdp = rsp->rda[cpu];
        struct rcu_node *rnp;
@@ -949,29 +950,30 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                spin_lock(&rnp->lock);          /* irqs already disabled. */
                rnp->qsmaskinit &= ~mask;
                if (rnp->qsmaskinit != 0) {
-                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                        if (rnp != rdp->mynode)
+                                spin_unlock(&rnp->lock); /* irqs remain disabled. */
                        break;
                }
+                if (rnp == rdp->mynode)
-                /*
+                        need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-                 * If there was a task blocking the current grace period,
+                else
-                 * and if all CPUs have checked in, we need to propagate
+                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
-                 * the quiescent state up the rcu_node hierarchy.  But that
-                 * is inconvenient at the moment due to deadlock issues if
-                 * this should end the current grace period.  So set the
-                 * offlined CPU's bit in ->qsmask in order to force the
-                 * next force_quiescent_state() invocation to clean up this
-                 * mess in a deadlock-free manner.
-                 */
-                if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
-                        rnp->qsmask |= mask;
                mask = rnp->grpmask;
-                spin_unlock(&rnp->lock);        /* irqs remain disabled. */
                rnp = rnp->parent;
        } while (rnp != NULL);
-        spin_unlock_irqrestore(&rsp->onofflock, flags);
+        /*
+         * We still hold the leaf rcu_node structure lock here, and
+         * irqs are still disabled.  The reason for this subterfuge is
+         * because invoking task_quiet() with ->onofflock held leads
+         * to deadlock.
+         */
+        spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+        rnp = rdp->mynode;
+        if (need_quiet)
+                task_quiet(rnp, flags);
+        else
+                spin_unlock_irqrestore(&rnp->lock, flags);
        rcu_adopt_orphan_cbs(rsp);
 }
@@ -1731,6 +1733,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                rnp = rsp->level[i];
                for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
                        spin_lock_init(&rnp->lock);
+                        lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
                        rnp->gpnum = 0;
                        rnp->qsmask = 0;
                        rnp->qsmaskinit = 0;
@@ -1753,7 +1756,6 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                        INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
                }
        }
-        lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
 }
 /*
author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2009-11-22 11:53:48 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-11-22 12:58:15 -0500
commit	b668c9cf3e58739dac54a1d6f42f2b4bdd980b3e (patch)
tree	c0165d39532a2314f8187d765f3c8ddf88b72831 /kernel/rcutree.c
parent	2f51f9884f6a36b0fe9636d5a1937e5cbd25723b (diff)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 9b36d6d7fb97..b79bfcd28e95 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c
@@ -51,7 +51,7 @@
51		51
52	/* Data structures. */	52	/* Data structures. */
53		53
54	static struct lock_class_key rcu_root_class;	54	static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55		55
56	#define RCU_STATE_INITIALIZER(name) { \	56	#define RCU_STATE_INITIALIZER(name) { \
57	.level = { &name.node[0] }, \	57	.level = { &name.node[0] }, \
@@ -936,6 +936,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
936	{	936	{
937	unsigned long flags;	937	unsigned long flags;
938	unsigned long mask;	938	unsigned long mask;
		939	int need_quiet = 0;
939	struct rcu_data *rdp = rsp->rda[cpu];	940	struct rcu_data *rdp = rsp->rda[cpu];
940	struct rcu_node *rnp;	941	struct rcu_node *rnp;
941		942
@@ -949,29 +950,30 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
949	spin_lock(&rnp->lock); /* irqs already disabled. */	950	spin_lock(&rnp->lock); /* irqs already disabled. */
950	rnp->qsmaskinit &= ~mask;	951	rnp->qsmaskinit &= ~mask;
951	if (rnp->qsmaskinit != 0) {	952	if (rnp->qsmaskinit != 0) {
952	spin_unlock(&rnp->lock); /* irqs remain disabled. */	953	if (rnp != rdp->mynode)
		954	spin_unlock(&rnp->lock); /* irqs remain disabled. */
953	break;	955	break;
954	}	956	}
955		957	if (rnp == rdp->mynode)
956	/*	958	need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
957	* If there was a task blocking the current grace period,	959	else
958	* and if all CPUs have checked in, we need to propagate	960	spin_unlock(&rnp->lock); /* irqs remain disabled. */
959	* the quiescent state up the rcu_node hierarchy. But that
960	* is inconvenient at the moment due to deadlock issues if
961	* this should end the current grace period. So set the
962	* offlined CPU's bit in ->qsmask in order to force the
963	* next force_quiescent_state() invocation to clean up this
964	* mess in a deadlock-free manner.
965	*/
966	if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
967	rnp->qsmask |= mask;
968
969	mask = rnp->grpmask;	961	mask = rnp->grpmask;
970	spin_unlock(&rnp->lock); /* irqs remain disabled. */
971	rnp = rnp->parent;	962	rnp = rnp->parent;
972	} while (rnp != NULL);	963	} while (rnp != NULL);
973		964
974	spin_unlock_irqrestore(&rsp->onofflock, flags);	965	/*
		966	* We still hold the leaf rcu_node structure lock here, and
		967	* irqs are still disabled. The reason for this subterfuge is
		968	* because invoking task_quiet() with ->onofflock held leads
		969	* to deadlock.
		970	*/
		971	spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
		972	rnp = rdp->mynode;
		973	if (need_quiet)
		974	task_quiet(rnp, flags);
		975	else
		976	spin_unlock_irqrestore(&rnp->lock, flags);
975		977
976	rcu_adopt_orphan_cbs(rsp);	978	rcu_adopt_orphan_cbs(rsp);
977	}	979	}
@@ -1731,6 +1733,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1731	rnp = rsp->level[i];	1733	rnp = rsp->level[i];
1732	for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {	1734	for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1733	spin_lock_init(&rnp->lock);	1735	spin_lock_init(&rnp->lock);
		1736	lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1734	rnp->gpnum = 0;	1737	rnp->gpnum = 0;
1735	rnp->qsmask = 0;	1738	rnp->qsmask = 0;
1736	rnp->qsmaskinit = 0;	1739	rnp->qsmaskinit = 0;
@@ -1753,7 +1756,6 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1753	INIT_LIST_HEAD(&rnp->blocked_tasks[1]);	1756	INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1754	}	1757	}
1755	}	1758	}
1756	lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
1757	}	1759	}
1758		1760
1759	/*	1761	/*