aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul E. McKenney <paulmck@linux.vnet.ibm.com>2009-11-22 11:53:48 -0500
committerIngo Molnar <mingo@elte.hu>2009-11-22 12:58:15 -0500
commitb668c9cf3e58739dac54a1d6f42f2b4bdd980b3e (patch)
treec0165d39532a2314f8187d765f3c8ddf88b72831
parent2f51f9884f6a36b0fe9636d5a1937e5cbd25723b (diff)
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes offline, all of the tasks queued on that leaf rcu_node structure (due to having blocked in their current RCU read-side critical sections) are requeued onto the root rcu_node structure. This requeuing is carried out by rcu_preempt_offline_tasks(). However, it is possible that these queued tasks are the only thing preventing the leaf rcu_node structure from reporting a quiescent state up the rcu_node hierarchy. Unfortunately, the old code would fail to do this reporting, resulting in a grace-period stall given the following sequence of events: 1. Kernel built for more than 32 CPUs on 32-bit systems or for more than 64 CPUs on 64-bit systems, so that there is more than one rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set to a number smaller than CONFIG_NR_CPUS.) 2. The kernel is built with CONFIG_TREE_PREEMPT_RCU. 3. A task running on a CPU associated with a given leaf rcu_node structure blocks while in an RCU read-side critical section -and- that CPU has not yet passed through a quiescent state for the current RCU grace period. This will cause the task to be queued on the leaf rcu_node's blocked_tasks[] array, in particular, on the element of this array corresponding to the current grace period. 4. Each of the remaining CPUs corresponding to this same leaf rcu_node structure passes through a quiescent state. However, the task is still in its RCU read-side critical section, so these quiescent states cannot be reported further up the rcu_node hierarchy. Nevertheless, all bits in the leaf rcu_node structure's ->qsmask field are now zero. 5. Each of the remaining CPUs goes offline. (The events in steps #4 and #5 can happen in any order as long as each CPU passes through a quiescent state before going offline.) 6. 
When the last CPU goes offline, __rcu_offline_cpu() will invoke rcu_preempt_offline_tasks(), which will move the task to the root rcu_node structure, but without reporting a quiescent state up the rcu_node hierarchy (and this failure to report a quiescent state is the bug). But because this leaf rcu_node structure's ->qsmask field is already zero and its ->blocked_tasks[] entries are all empty, force_quiescent_state() will skip this rcu_node structure. Therefore, grace periods are now hung. This patch abstracts some code out of rcu_read_unlock_special(), calling the result task_quiet() by analogy with cpu_quiet(), and invokes task_quiet() from both rcu_read_unlock_special() and __rcu_offline_cpu(). Invoking task_quiet() from __rcu_offline_cpu() reports the quiescent state up the rcu_node hierarchy, fixing the bug. This ends up requiring a separate lock_class_key per level of the rcu_node hierarchy, which this patch also provides. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12589088301770-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--kernel/rcutree.c40
-rw-r--r--kernel/rcutree.h3
-rw-r--r--kernel/rcutree_plugin.h85
3 files changed, 85 insertions, 43 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9b36d6d7fb97..b79bfcd28e95 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -51,7 +51,7 @@
51 51
52/* Data structures. */ 52/* Data structures. */
53 53
54static struct lock_class_key rcu_root_class; 54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 55
56#define RCU_STATE_INITIALIZER(name) { \ 56#define RCU_STATE_INITIALIZER(name) { \
57 .level = { &name.node[0] }, \ 57 .level = { &name.node[0] }, \
@@ -936,6 +936,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
936{ 936{
937 unsigned long flags; 937 unsigned long flags;
938 unsigned long mask; 938 unsigned long mask;
939 int need_quiet = 0;
939 struct rcu_data *rdp = rsp->rda[cpu]; 940 struct rcu_data *rdp = rsp->rda[cpu];
940 struct rcu_node *rnp; 941 struct rcu_node *rnp;
941 942
@@ -949,29 +950,30 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
949 spin_lock(&rnp->lock); /* irqs already disabled. */ 950 spin_lock(&rnp->lock); /* irqs already disabled. */
950 rnp->qsmaskinit &= ~mask; 951 rnp->qsmaskinit &= ~mask;
951 if (rnp->qsmaskinit != 0) { 952 if (rnp->qsmaskinit != 0) {
952 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 953 if (rnp != rdp->mynode)
954 spin_unlock(&rnp->lock); /* irqs remain disabled. */
953 break; 955 break;
954 } 956 }
955 957 if (rnp == rdp->mynode)
956 /* 958 need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp);
957 * If there was a task blocking the current grace period, 959 else
958 * and if all CPUs have checked in, we need to propagate 960 spin_unlock(&rnp->lock); /* irqs remain disabled. */
959 * the quiescent state up the rcu_node hierarchy. But that
960 * is inconvenient at the moment due to deadlock issues if
961 * this should end the current grace period. So set the
962 * offlined CPU's bit in ->qsmask in order to force the
963 * next force_quiescent_state() invocation to clean up this
964 * mess in a deadlock-free manner.
965 */
966 if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
967 rnp->qsmask |= mask;
968
969 mask = rnp->grpmask; 961 mask = rnp->grpmask;
970 spin_unlock(&rnp->lock); /* irqs remain disabled. */
971 rnp = rnp->parent; 962 rnp = rnp->parent;
972 } while (rnp != NULL); 963 } while (rnp != NULL);
973 964
974 spin_unlock_irqrestore(&rsp->onofflock, flags); 965 /*
966 * We still hold the leaf rcu_node structure lock here, and
967 * irqs are still disabled. The reason for this subterfuge is
968 * because invoking task_quiet() with ->onofflock held leads
969 * to deadlock.
970 */
971 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
972 rnp = rdp->mynode;
973 if (need_quiet)
974 task_quiet(rnp, flags);
975 else
976 spin_unlock_irqrestore(&rnp->lock, flags);
975 977
976 rcu_adopt_orphan_cbs(rsp); 978 rcu_adopt_orphan_cbs(rsp);
977} 979}
@@ -1731,6 +1733,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1731 rnp = rsp->level[i]; 1733 rnp = rsp->level[i];
1732 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1734 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1733 spin_lock_init(&rnp->lock); 1735 spin_lock_init(&rnp->lock);
1736 lockdep_set_class(&rnp->lock, &rcu_node_class[i]);
1734 rnp->gpnum = 0; 1737 rnp->gpnum = 0;
1735 rnp->qsmask = 0; 1738 rnp->qsmask = 0;
1736 rnp->qsmaskinit = 0; 1739 rnp->qsmaskinit = 0;
@@ -1753,7 +1756,6 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1753 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1756 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1754 } 1757 }
1755 } 1758 }
1756 lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class);
1757} 1759}
1758 1760
1759/* 1761/*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 17a28a08b559..a81188c42929 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -305,6 +305,9 @@ static void rcu_bootup_announce(void);
305long rcu_batches_completed(void); 305long rcu_batches_completed(void);
306static void rcu_preempt_note_context_switch(int cpu); 306static void rcu_preempt_note_context_switch(int cpu);
307static int rcu_preempted_readers(struct rcu_node *rnp); 307static int rcu_preempted_readers(struct rcu_node *rnp);
308#ifdef CONFIG_HOTPLUG_CPU
309static void task_quiet(struct rcu_node *rnp, unsigned long flags);
310#endif /* #ifdef CONFIG_HOTPLUG_CPU */
308#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 311#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
309static void rcu_print_task_stall(struct rcu_node *rnp); 312static void rcu_print_task_stall(struct rcu_node *rnp);
310#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 313#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5ca2d26c5971..0bdb592eee66 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -160,11 +160,51 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
161} 161}
162 162
163/*
164 * Record a quiescent state for all tasks that were previously queued
165 * on the specified rcu_node structure and that were blocking the current
166 * RCU grace period. The caller must hold the specified rnp->lock with
167 * irqs disabled, and this lock is released upon return, but irqs remain
168 * disabled.
169 */
170static void task_quiet(struct rcu_node *rnp, unsigned long flags)
171 __releases(rnp->lock)
172{
173 unsigned long mask;
174 struct rcu_node *rnp_p;
175
176 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
177 spin_unlock_irqrestore(&rnp->lock, flags);
178 return; /* Still need more quiescent states! */
179 }
180
181 rnp_p = rnp->parent;
182 if (rnp_p == NULL) {
183 /*
184 * Either there is only one rcu_node in the tree,
185 * or tasks were kicked up to root rcu_node due to
186 * CPUs going offline.
187 */
188 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
189 return;
190 }
191
192 /* Report up the rest of the hierarchy. */
193 mask = rnp->grpmask;
194 spin_unlock(&rnp->lock); /* irqs remain disabled. */
195 spin_lock(&rnp_p->lock); /* irqs already disabled. */
196 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
197}
198
199/*
200 * Handle special cases during rcu_read_unlock(), such as needing to
201 * notify RCU core processing or task having blocked during the RCU
202 * read-side critical section.
203 */
163static void rcu_read_unlock_special(struct task_struct *t) 204static void rcu_read_unlock_special(struct task_struct *t)
164{ 205{
165 int empty; 206 int empty;
166 unsigned long flags; 207 unsigned long flags;
167 unsigned long mask;
168 struct rcu_node *rnp; 208 struct rcu_node *rnp;
169 int special; 209 int special;
170 210
@@ -213,30 +253,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
213 /* 253 /*
214 * If this was the last task on the current list, and if 254 * If this was the last task on the current list, and if
215 * we aren't waiting on any CPUs, report the quiescent state. 255 * we aren't waiting on any CPUs, report the quiescent state.
216 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() 256 * Note that task_quiet() releases rnp->lock.
217 * drop rnp->lock and restore irq.
218 */ 257 */
219 if (!empty && rnp->qsmask == 0 && 258 if (empty)
220 !rcu_preempted_readers(rnp)) {
221 struct rcu_node *rnp_p;
222
223 if (rnp->parent == NULL) {
224 /* Only one rcu_node in the tree. */
225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
226 return;
227 }
228 /* Report up the rest of the hierarchy. */
229 mask = rnp->grpmask;
230 spin_unlock_irqrestore(&rnp->lock, flags); 259 spin_unlock_irqrestore(&rnp->lock, flags);
231 rnp_p = rnp->parent; 260 else
232 spin_lock_irqsave(&rnp_p->lock, flags); 261 task_quiet(rnp, flags);
233 WARN_ON_ONCE(rnp->qsmask); 262 } else {
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); 263 local_irq_restore(flags);
235 return;
236 }
237 spin_unlock(&rnp->lock);
238 } 264 }
239 local_irq_restore(flags);
240} 265}
241 266
242/* 267/*
@@ -303,6 +328,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
303 * rcu_node. The reason for not just moving them to the immediate 328 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to 329 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock. 330 * make more than two attempts to acquire the target rcu_node's lock.
331 * Returns true if there were tasks blocking the current RCU grace
332 * period.
306 * 333 *
307 * Returns 1 if there was previously a task blocking the current grace 334 * Returns 1 if there was previously a task blocking the current grace
308 * period on the specified rcu_node structure. 335 * period on the specified rcu_node structure.
@@ -316,7 +343,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
316 int i; 343 int i;
317 struct list_head *lp; 344 struct list_head *lp;
318 struct list_head *lp_root; 345 struct list_head *lp_root;
319 int retval = rcu_preempted_readers(rnp); 346 int retval;
320 struct rcu_node *rnp_root = rcu_get_root(rsp); 347 struct rcu_node *rnp_root = rcu_get_root(rsp);
321 struct task_struct *tp; 348 struct task_struct *tp;
322 349
@@ -334,6 +361,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
334 * rcu_nodes in terms of gp_num value. This fact allows us to 361 * rcu_nodes in terms of gp_num value. This fact allows us to
335 * move the blocked_tasks[] array directly, element by element. 362 * move the blocked_tasks[] array directly, element by element.
336 */ 363 */
364 retval = rcu_preempted_readers(rnp);
337 for (i = 0; i < 2; i++) { 365 for (i = 0; i < 2; i++) {
338 lp = &rnp->blocked_tasks[i]; 366 lp = &rnp->blocked_tasks[i];
339 lp_root = &rnp_root->blocked_tasks[i]; 367 lp_root = &rnp_root->blocked_tasks[i];
@@ -346,7 +374,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
346 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 374 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
347 } 375 }
348 } 376 }
349
350 return retval; 377 return retval;
351} 378}
352 379
@@ -512,6 +539,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
512 return 0; 539 return 0;
513} 540}
514 541
542#ifdef CONFIG_HOTPLUG_CPU
543
544/* Because preemptible RCU does not exist, no quieting of tasks. */
545static void task_quiet(struct rcu_node *rnp, unsigned long flags)
546{
547 spin_unlock_irqrestore(&rnp->lock, flags);
548}
549
550#endif /* #ifdef CONFIG_HOTPLUG_CPU */
551
515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 552#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
516 553
517/* 554/*