diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2009-11-22 11:53:48 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-11-22 12:58:15 -0500 |
commit | b668c9cf3e58739dac54a1d6f42f2b4bdd980b3e (patch) | |
tree | c0165d39532a2314f8187d765f3c8ddf88b72831 /kernel | |
parent | 2f51f9884f6a36b0fe9636d5a1937e5cbd25723b (diff) |
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure pass through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs go offline. (The events in step
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->block_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_lock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/rcutree.c | 40 | ||||
-rw-r--r-- | kernel/rcutree.h | 3 | ||||
-rw-r--r-- | kernel/rcutree_plugin.h | 85 |
3 files changed, 85 insertions, 43 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 9b36d6d7fb97..b79bfcd28e95 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -51,7 +51,7 @@ | |||
51 | 51 | ||
52 | /* Data structures. */ | 52 | /* Data structures. */ |
53 | 53 | ||
54 | static struct lock_class_key rcu_root_class; | 54 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; |
55 | 55 | ||
56 | #define RCU_STATE_INITIALIZER(name) { \ | 56 | #define RCU_STATE_INITIALIZER(name) { \ |
57 | .level = { &name.node[0] }, \ | 57 | .level = { &name.node[0] }, \ |
@@ -936,6 +936,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
936 | { | 936 | { |
937 | unsigned long flags; | 937 | unsigned long flags; |
938 | unsigned long mask; | 938 | unsigned long mask; |
939 | int need_quiet = 0; | ||
939 | struct rcu_data *rdp = rsp->rda[cpu]; | 940 | struct rcu_data *rdp = rsp->rda[cpu]; |
940 | struct rcu_node *rnp; | 941 | struct rcu_node *rnp; |
941 | 942 | ||
@@ -949,29 +950,30 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
949 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 950 | spin_lock(&rnp->lock); /* irqs already disabled. */ |
950 | rnp->qsmaskinit &= ~mask; | 951 | rnp->qsmaskinit &= ~mask; |
951 | if (rnp->qsmaskinit != 0) { | 952 | if (rnp->qsmaskinit != 0) { |
952 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 953 | if (rnp != rdp->mynode) |
954 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
953 | break; | 955 | break; |
954 | } | 956 | } |
955 | 957 | if (rnp == rdp->mynode) | |
956 | /* | 958 | need_quiet = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
957 | * If there was a task blocking the current grace period, | 959 | else |
958 | * and if all CPUs have checked in, we need to propagate | 960 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
959 | * the quiescent state up the rcu_node hierarchy. But that | ||
960 | * is inconvenient at the moment due to deadlock issues if | ||
961 | * this should end the current grace period. So set the | ||
962 | * offlined CPU's bit in ->qsmask in order to force the | ||
963 | * next force_quiescent_state() invocation to clean up this | ||
964 | * mess in a deadlock-free manner. | ||
965 | */ | ||
966 | if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask) | ||
967 | rnp->qsmask |= mask; | ||
968 | |||
969 | mask = rnp->grpmask; | 961 | mask = rnp->grpmask; |
970 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
971 | rnp = rnp->parent; | 962 | rnp = rnp->parent; |
972 | } while (rnp != NULL); | 963 | } while (rnp != NULL); |
973 | 964 | ||
974 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 965 | /* |
966 | * We still hold the leaf rcu_node structure lock here, and | ||
967 | * irqs are still disabled. The reason for this subterfuge is | ||
968 | * because invoking task_quiet() with ->onofflock held leads | ||
969 | * to deadlock. | ||
970 | */ | ||
971 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | ||
972 | rnp = rdp->mynode; | ||
973 | if (need_quiet) | ||
974 | task_quiet(rnp, flags); | ||
975 | else | ||
976 | spin_unlock_irqrestore(&rnp->lock, flags); | ||
975 | 977 | ||
976 | rcu_adopt_orphan_cbs(rsp); | 978 | rcu_adopt_orphan_cbs(rsp); |
977 | } | 979 | } |
@@ -1731,6 +1733,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1731 | rnp = rsp->level[i]; | 1733 | rnp = rsp->level[i]; |
1732 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 1734 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { |
1733 | spin_lock_init(&rnp->lock); | 1735 | spin_lock_init(&rnp->lock); |
1736 | lockdep_set_class(&rnp->lock, &rcu_node_class[i]); | ||
1734 | rnp->gpnum = 0; | 1737 | rnp->gpnum = 0; |
1735 | rnp->qsmask = 0; | 1738 | rnp->qsmask = 0; |
1736 | rnp->qsmaskinit = 0; | 1739 | rnp->qsmaskinit = 0; |
@@ -1753,7 +1756,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1753 | INIT_LIST_HEAD(&rnp->blocked_tasks[1]); | 1756 | INIT_LIST_HEAD(&rnp->blocked_tasks[1]); |
1754 | } | 1757 | } |
1755 | } | 1758 | } |
1756 | lockdep_set_class(&rcu_get_root(rsp)->lock, &rcu_root_class); | ||
1757 | } | 1759 | } |
1758 | 1760 | ||
1759 | /* | 1761 | /* |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 17a28a08b559..a81188c42929 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -305,6 +305,9 @@ static void rcu_bootup_announce(void); | |||
305 | long rcu_batches_completed(void); | 305 | long rcu_batches_completed(void); |
306 | static void rcu_preempt_note_context_switch(int cpu); | 306 | static void rcu_preempt_note_context_switch(int cpu); |
307 | static int rcu_preempted_readers(struct rcu_node *rnp); | 307 | static int rcu_preempted_readers(struct rcu_node *rnp); |
308 | #ifdef CONFIG_HOTPLUG_CPU | ||
309 | static void task_quiet(struct rcu_node *rnp, unsigned long flags); | ||
310 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
308 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 311 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
309 | static void rcu_print_task_stall(struct rcu_node *rnp); | 312 | static void rcu_print_task_stall(struct rcu_node *rnp); |
310 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 313 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5ca2d26c5971..0bdb592eee66 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -160,11 +160,51 @@ static int rcu_preempted_readers(struct rcu_node *rnp) | |||
160 | return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); | 160 | return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); |
161 | } | 161 | } |
162 | 162 | ||
163 | /* | ||
164 | * Record a quiescent state for all tasks that were previously queued | ||
165 | * on the specified rcu_node structure and that were blocking the current | ||
166 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
167 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
168 | * disabled. | ||
169 | */ | ||
170 | static void task_quiet(struct rcu_node *rnp, unsigned long flags) | ||
171 | __releases(rnp->lock) | ||
172 | { | ||
173 | unsigned long mask; | ||
174 | struct rcu_node *rnp_p; | ||
175 | |||
176 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | ||
177 | spin_unlock_irqrestore(&rnp->lock, flags); | ||
178 | return; /* Still need more quiescent states! */ | ||
179 | } | ||
180 | |||
181 | rnp_p = rnp->parent; | ||
182 | if (rnp_p == NULL) { | ||
183 | /* | ||
184 | * Either there is only one rcu_node in the tree, | ||
185 | * or tasks were kicked up to root rcu_node due to | ||
186 | * CPUs going offline. | ||
187 | */ | ||
188 | cpu_quiet_msk_finish(&rcu_preempt_state, flags); | ||
189 | return; | ||
190 | } | ||
191 | |||
192 | /* Report up the rest of the hierarchy. */ | ||
193 | mask = rnp->grpmask; | ||
194 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
195 | spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
196 | cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
201 | * notify RCU core processing or task having blocked during the RCU | ||
202 | * read-side critical section. | ||
203 | */ | ||
163 | static void rcu_read_unlock_special(struct task_struct *t) | 204 | static void rcu_read_unlock_special(struct task_struct *t) |
164 | { | 205 | { |
165 | int empty; | 206 | int empty; |
166 | unsigned long flags; | 207 | unsigned long flags; |
167 | unsigned long mask; | ||
168 | struct rcu_node *rnp; | 208 | struct rcu_node *rnp; |
169 | int special; | 209 | int special; |
170 | 210 | ||
@@ -213,30 +253,15 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
213 | /* | 253 | /* |
214 | * If this was the last task on the current list, and if | 254 | * If this was the last task on the current list, and if |
215 | * we aren't waiting on any CPUs, report the quiescent state. | 255 | * we aren't waiting on any CPUs, report the quiescent state. |
216 | * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() | 256 | * Note that task_quiet() releases rnp->lock. |
217 | * drop rnp->lock and restore irq. | ||
218 | */ | 257 | */ |
219 | if (!empty && rnp->qsmask == 0 && | 258 | if (empty) |
220 | !rcu_preempted_readers(rnp)) { | ||
221 | struct rcu_node *rnp_p; | ||
222 | |||
223 | if (rnp->parent == NULL) { | ||
224 | /* Only one rcu_node in the tree. */ | ||
225 | cpu_quiet_msk_finish(&rcu_preempt_state, flags); | ||
226 | return; | ||
227 | } | ||
228 | /* Report up the rest of the hierarchy. */ | ||
229 | mask = rnp->grpmask; | ||
230 | spin_unlock_irqrestore(&rnp->lock, flags); | 259 | spin_unlock_irqrestore(&rnp->lock, flags); |
231 | rnp_p = rnp->parent; | 260 | else |
232 | spin_lock_irqsave(&rnp_p->lock, flags); | 261 | task_quiet(rnp, flags); |
233 | WARN_ON_ONCE(rnp->qsmask); | 262 | } else { |
234 | cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); | 263 | local_irq_restore(flags); |
235 | return; | ||
236 | } | ||
237 | spin_unlock(&rnp->lock); | ||
238 | } | 264 | } |
239 | local_irq_restore(flags); | ||
240 | } | 265 | } |
241 | 266 | ||
242 | /* | 267 | /* |
@@ -303,6 +328,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
303 | * rcu_node. The reason for not just moving them to the immediate | 328 | * rcu_node. The reason for not just moving them to the immediate |
304 | * parent is to remove the need for rcu_read_unlock_special() to | 329 | * parent is to remove the need for rcu_read_unlock_special() to |
305 | * make more than two attempts to acquire the target rcu_node's lock. | 330 | * make more than two attempts to acquire the target rcu_node's lock. |
331 | * Returns true if there were tasks blocking the current RCU grace | ||
332 | * period. | ||
306 | * | 333 | * |
307 | * Returns 1 if there was previously a task blocking the current grace | 334 | * Returns 1 if there was previously a task blocking the current grace |
308 | * period on the specified rcu_node structure. | 335 | * period on the specified rcu_node structure. |
@@ -316,7 +343,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
316 | int i; | 343 | int i; |
317 | struct list_head *lp; | 344 | struct list_head *lp; |
318 | struct list_head *lp_root; | 345 | struct list_head *lp_root; |
319 | int retval = rcu_preempted_readers(rnp); | 346 | int retval; |
320 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 347 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
321 | struct task_struct *tp; | 348 | struct task_struct *tp; |
322 | 349 | ||
@@ -334,6 +361,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
334 | * rcu_nodes in terms of gp_num value. This fact allows us to | 361 | * rcu_nodes in terms of gp_num value. This fact allows us to |
335 | * move the blocked_tasks[] array directly, element by element. | 362 | * move the blocked_tasks[] array directly, element by element. |
336 | */ | 363 | */ |
364 | retval = rcu_preempted_readers(rnp); | ||
337 | for (i = 0; i < 2; i++) { | 365 | for (i = 0; i < 2; i++) { |
338 | lp = &rnp->blocked_tasks[i]; | 366 | lp = &rnp->blocked_tasks[i]; |
339 | lp_root = &rnp_root->blocked_tasks[i]; | 367 | lp_root = &rnp_root->blocked_tasks[i]; |
@@ -346,7 +374,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
346 | spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 374 | spin_unlock(&rnp_root->lock); /* irqs remain disabled */ |
347 | } | 375 | } |
348 | } | 376 | } |
349 | |||
350 | return retval; | 377 | return retval; |
351 | } | 378 | } |
352 | 379 | ||
@@ -512,6 +539,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp) | |||
512 | return 0; | 539 | return 0; |
513 | } | 540 | } |
514 | 541 | ||
542 | #ifdef CONFIG_HOTPLUG_CPU | ||
543 | |||
544 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | ||
545 | static void task_quiet(struct rcu_node *rnp, unsigned long flags) | ||
546 | { | ||
547 | spin_unlock_irqrestore(&rnp->lock, flags); | ||
548 | } | ||
549 | |||
550 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
551 | |||
515 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 552 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
516 | 553 | ||
517 | /* | 554 | /* |