author     Anton Blanchard <anton@samba.org>              2011-01-20 17:44:33 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org> 2011-01-20 20:02:06 -0500
commit     6dc19899958e420a931274b94019e267e2396d3e
tree       bb99ffb491f086b10bd92db3fc7cff214bb1a329  /kernel/smp.c
parent     713735b4233fad3ae35b5cad656baa41413887ca
kernel/smp.c: fix smp_call_function_many() SMP race
I noticed a failure where we hit the following WARN_ON in
generic_smp_call_function_interrupt:
        if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
                continue;

        data->csd.func(data->csd.info);

        refs = atomic_dec_return(&data->refs);
        WARN_ON(refs < 0);      <-------------------------
We atomically tested and cleared our bit in the cpumask, and yet the
number of cpus left (ie refs) was 0. How can this be?
It turns out commit 54fdade1c3332391948ec43530c02c4794a38172
("generic-ipi: make struct call_function_data lockless") is at fault. It
removes locking from smp_call_function_many and in doing so creates a
rather complicated race.
The problem comes about because:
- The smp_call_function_many interrupt handler walks call_function.queue
without any locking.
- We reuse a percpu data structure in smp_call_function_many.
- We do not wait for any RCU grace period before starting the next
smp_call_function_many.
Imagine a scenario where CPU A does two smp_call_functions back to back,
and CPU B does an smp_call_function in between. We concentrate on how CPU
C handles the calls:
CPU A                CPU B                CPU C                    CPU D

smp_call_function
                                          smp_call_function_interrupt
                                              walks
                                          call_function.queue sees
                                          data from CPU A on list

                     smp_call_function

                                          smp_call_function_interrupt
                                              walks

                                          call_function.queue sees
                                          (stale) CPU A on list
                                                                   smp_call_function int
                                                                   clears last ref on A
                                                                   list_del_rcu, unlock
smp_call_function reuses
percpu *data A
                                          data->cpumask sees and
                                          clears bit in cpumask
                                          might be using old or new fn!
                                          decrements refs below 0
set data->refs (too late!)
The important thing to note is that, since the interrupt handler walks a
potentially stale call_function.queue without any locking, another cpu can
view the percpu *data structure at any time, even while its owner is in the
process of initialising it.
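For reference, the percpu element in question looks roughly like this (a
sketch based on the fields used above and in the diff below, not a verbatim
quote of kernel/smp.c):

        struct call_function_data {
                struct call_single_data csd;     /* csd.list links the entry onto call_function.queue */
                atomic_t                refs;    /* cpus that still have to run csd.func(csd.info) */
                cpumask_var_t           cpumask; /* cpus the call is aimed at; each clears its own bit */
        };

smp_call_function_many() reuses one of these per cpu (cfd_data in the diff
below) for every call, so a handler walking the queue can observe it
mid-reinitialisation.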
The following test case hits the WARN_ON 100% of the time on my PowerPC
box (having 128 threads does help :)
#include <linux/module.h>
#include <linux/init.h>

#define ITERATIONS 100

static void do_nothing_ipi(void *dummy)
{
}

static void do_ipis(struct work_struct *dummy)
{
        int i;

        for (i = 0; i < ITERATIONS; i++)
                smp_call_function(do_nothing_ipi, NULL, 1);

        printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id());
}

static struct work_struct work[NR_CPUS];

static int __init testcase_init(void)
{
        int cpu;

        for_each_online_cpu(cpu) {
                INIT_WORK(&work[cpu], do_ipis);
                schedule_work_on(cpu, &work[cpu]);
        }

        return 0;
}

static void __exit testcase_exit(void)
{
}

module_init(testcase_init)
module_exit(testcase_exit)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Anton Blanchard");
I tried to fix it by ordering the read and the write of ->cpumask and
->refs. In doing so I missed a critical case, but Paul McKenney was able
to spot my bug thankfully :) To ensure we aren't viewing previous
iterations, the interrupt handler needs to read ->refs, then ->cpumask, then
->refs _again_.
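(As Milton's note below mentions, the extra initial read of ->refs was later
dropped.) Condensed into a sketch rather than the literal hunks, the
discipline the final patch establishes is: the caller publishes the cpumask
before refs, and the handler checks both, with a read barrier in between,
before doing any work:

        /* writer side, smp_call_function_many(): publish cpumask before refs */
        data->csd.func = func;
        data->csd.info = info;
        cpumask_and(data->cpumask, mask, cpu_online_mask);
        cpumask_clear_cpu(this_cpu, data->cpumask);
        smp_wmb();              /* make cpumask (and func/info) visible before refs is set */
        atomic_set(&data->refs, cpumask_weight(data->cpumask));

        /* reader side, generic_smp_call_function_interrupt(): */
        if (!cpumask_test_cpu(cpu, data->cpumask))
                continue;       /* entry not aimed at us, or not yet initialised */
        smp_rmb();              /* pairs with the smp_wmb() above */
        if (atomic_read(&data->refs) == 0)
                continue;       /* refs not yet set for this iteration: skip the stale entry */
        if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
                continue;       /* only run the callback once per cpu */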
Thanks to Milton Miller and Paul McKenney for helping to debug this issue.
[miltonm@bga.com: add WARN_ON and BUG_ON, remove extra read of refs before initial read of mask that doesn't help (also noted by Peter Zijlstra), adjust comments, hopefully clarify scenario ]
[miltonm@bga.com: remove excess tests]
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Milton Miller <miltonm@bga.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: <stable@kernel.org> [2.6.32+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/smp.c')
 kernel/smp.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+), 0 deletions(-)
diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e069987..17c6e5860231 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -195,6 +195,24 @@ void generic_smp_call_function_interrupt(void)
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
 
+		/*
+		 * Since we walk the list without any locks, we might
+		 * see an entry that was completed, removed from the
+		 * list and is in the process of being reused.
+		 *
+		 * We must check that the cpu is in the cpumask before
+		 * checking the refs, and both must be set before
+		 * executing the callback on this cpu.
+		 */
+
+		if (!cpumask_test_cpu(cpu, data->cpumask))
+			continue;
+
+		smp_rmb();
+
+		if (atomic_read(&data->refs) == 0)
+			continue;
+
 		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
 			continue;
 
@@ -203,6 +221,8 @@ void generic_smp_call_function_interrupt(void)
 		refs = atomic_dec_return(&data->refs);
 		WARN_ON(refs < 0);
 		if (!refs) {
+			WARN_ON(!cpumask_empty(data->cpumask));
+
 			raw_spin_lock(&call_function.lock);
 			list_del_rcu(&data->csd.list);
 			raw_spin_unlock(&call_function.lock);
@@ -454,11 +474,21 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	data = &__get_cpu_var(cfd_data);
 	csd_lock(&data->csd);
+	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
 
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
 	cpumask_clear_cpu(this_cpu, data->cpumask);
+
+	/*
+	 * To ensure the interrupt handler gets an complete view
+	 * we order the cpumask and refs writes and order the read
+	 * of them in the interrupt handler. In addition we may
+	 * only clear our own cpu bit from the mask.
+	 */
+	smp_wmb();
+
 	atomic_set(&data->refs, cpumask_weight(data->cpumask));
 
 	raw_spin_lock_irqsave(&call_function.lock, flags);