diff options
author | Peter Zijlstra <peterz@infradead.org> | 2013-10-11 08:38:20 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2013-10-16 08:22:16 -0400 |
commit | 6acce3ef84520537f8a09a12c9ddbe814a584dd2 (patch) | |
tree | b4e117df4a57be6a040529c148480227c3d100cc /kernel | |
parent | 746023159c40c523b08a3bc3d213dac212385895 (diff) |
sched: Remove get_online_cpus() usage
Remove get_online_cpus() usage from the scheduler; there are 4 sites that
use it:
- sched_init_smp(); where it's completely superfluous since we're in
'early' boot and there simply cannot be any hotplugging.
- sched_getaffinity(); we already take a raw spinlock to protect the
task cpus_allowed mask, this disables preemption and therefore
also stabilizes cpu_online_mask as that's modified using
stop_machine. However switch to active mask for symmetry with
sched_setaffinity()/set_cpus_allowed_ptr(). We guarantee active
mask stability by inserting sync_rcu/sched() into _cpu_down.
- sched_setaffinity(); we don't appear to need get_online_cpus()
either; there are two sites where hotplug appears relevant:
* cpuset_cpus_allowed(); for the !cpuset case we use possible_mask,
for the cpuset case we hold task_lock, which is a spinlock and
thus for mainline disables preemption (might cause pain on RT).
* set_cpus_allowed_ptr(); Holds all scheduler locks and thus has
preemption properly disabled; also it already deals with hotplug
races explicitly where it releases them.
- migrate_swap(); we can make stop_two_cpus() do the heavy lifting for
us with a little trickery. By adding a sync_sched/rcu() after the
CPU_DOWN_PREPARE notifier we can provide preempt/rcu guarantees for
cpu_active_mask. Use these to validate that both our cpus are active
when queueing the stop work before we queue the stop_machine works
for take_cpu_down().
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/20131011123820.GV3081@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu.c | 17 | ||||
-rw-r--r-- | kernel/sched/core.c | 20 | ||||
-rw-r--r-- | kernel/stop_machine.c | 26 |
3 files changed, 48 insertions, 15 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a6..63aa50d7ce1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
308 | } | 308 | } |
309 | smpboot_park_threads(cpu); | 309 | smpboot_park_threads(cpu); |
310 | 310 | ||
311 | /* | ||
312 | * By now we've cleared cpu_active_mask, wait for all preempt-disabled | ||
313 | * and RCU users of this state to go away such that all new such users | ||
314 | * will observe it. | ||
315 | * | ||
316 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might | ||
317 | * not imply sync_sched(), so explicitly call both. | ||
318 | */ | ||
319 | #ifdef CONFIG_PREEMPT | ||
320 | synchronize_sched(); | ||
321 | #endif | ||
322 | synchronize_rcu(); | ||
323 | |||
324 | /* | ||
325 | * So now all preempt/rcu users must observe !cpu_active(). | ||
326 | */ | ||
327 | |||
311 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 328 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
312 | if (err) { | 329 | if (err) { |
313 | /* CPU didn't die: tell everyone. Can't complain. */ | 330 | /* CPU didn't die: tell everyone. Can't complain. */ |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a972acd468b0..c06b8d345fae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1085,8 +1085,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1085 | struct migration_swap_arg arg; | 1085 | struct migration_swap_arg arg; |
1086 | int ret = -EINVAL; | 1086 | int ret = -EINVAL; |
1087 | 1087 | ||
1088 | get_online_cpus(); | ||
1089 | |||
1090 | arg = (struct migration_swap_arg){ | 1088 | arg = (struct migration_swap_arg){ |
1091 | .src_task = cur, | 1089 | .src_task = cur, |
1092 | .src_cpu = task_cpu(cur), | 1090 | .src_cpu = task_cpu(cur), |
@@ -1097,6 +1095,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1097 | if (arg.src_cpu == arg.dst_cpu) | 1095 | if (arg.src_cpu == arg.dst_cpu) |
1098 | goto out; | 1096 | goto out; |
1099 | 1097 | ||
1098 | /* | ||
1099 | * These three tests are all lockless; this is OK since all of them | ||
1100 | * will be re-checked with proper locks held further down the line. | ||
1101 | */ | ||
1100 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | 1102 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) |
1101 | goto out; | 1103 | goto out; |
1102 | 1104 | ||
@@ -1109,7 +1111,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1109 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | 1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); |
1110 | 1112 | ||
1111 | out: | 1113 | out: |
1112 | put_online_cpus(); | ||
1113 | return ret; | 1114 | return ret; |
1114 | } | 1115 | } |
1115 | 1116 | ||
@@ -3710,7 +3711,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3710 | struct task_struct *p; | 3711 | struct task_struct *p; |
3711 | int retval; | 3712 | int retval; |
3712 | 3713 | ||
3713 | get_online_cpus(); | ||
3714 | rcu_read_lock(); | 3714 | rcu_read_lock(); |
3715 | 3715 | ||
3716 | p = find_process_by_pid(pid); | 3716 | p = find_process_by_pid(pid); |
@@ -3773,7 +3773,6 @@ out_free_cpus_allowed: | |||
3773 | free_cpumask_var(cpus_allowed); | 3773 | free_cpumask_var(cpus_allowed); |
3774 | out_put_task: | 3774 | out_put_task: |
3775 | put_task_struct(p); | 3775 | put_task_struct(p); |
3776 | put_online_cpus(); | ||
3777 | return retval; | 3776 | return retval; |
3778 | } | 3777 | } |
3779 | 3778 | ||
@@ -3818,7 +3817,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
3818 | unsigned long flags; | 3817 | unsigned long flags; |
3819 | int retval; | 3818 | int retval; |
3820 | 3819 | ||
3821 | get_online_cpus(); | ||
3822 | rcu_read_lock(); | 3820 | rcu_read_lock(); |
3823 | 3821 | ||
3824 | retval = -ESRCH; | 3822 | retval = -ESRCH; |
@@ -3831,12 +3829,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
3831 | goto out_unlock; | 3829 | goto out_unlock; |
3832 | 3830 | ||
3833 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 3831 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3834 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 3832 | cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); |
3835 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 3833 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3836 | 3834 | ||
3837 | out_unlock: | 3835 | out_unlock: |
3838 | rcu_read_unlock(); | 3836 | rcu_read_unlock(); |
3839 | put_online_cpus(); | ||
3840 | 3837 | ||
3841 | return retval; | 3838 | return retval; |
3842 | } | 3839 | } |
@@ -6494,14 +6491,17 @@ void __init sched_init_smp(void) | |||
6494 | 6491 | ||
6495 | sched_init_numa(); | 6492 | sched_init_numa(); |
6496 | 6493 | ||
6497 | get_online_cpus(); | 6494 | /* |
6495 | * There's no userspace yet to cause hotplug operations; hence all the | ||
6496 | * cpu masks are stable and all blatant races in the below code cannot | ||
6497 | * happen. | ||
6498 | */ | ||
6498 | mutex_lock(&sched_domains_mutex); | 6499 | mutex_lock(&sched_domains_mutex); |
6499 | init_sched_domains(cpu_active_mask); | 6500 | init_sched_domains(cpu_active_mask); |
6500 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 6501 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
6501 | if (cpumask_empty(non_isolated_cpus)) | 6502 | if (cpumask_empty(non_isolated_cpus)) |
6502 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 6503 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
6503 | mutex_unlock(&sched_domains_mutex); | 6504 | mutex_unlock(&sched_domains_mutex); |
6504 | put_online_cpus(); | ||
6505 | 6505 | ||
6506 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); | 6506 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); |
6507 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6507 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 32a6c44d8f78..c530bc5be7cf 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -234,11 +234,13 @@ static void irq_cpu_stop_queue_work(void *arg) | |||
234 | */ | 234 | */ |
235 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) | 235 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) |
236 | { | 236 | { |
237 | int call_cpu; | ||
238 | struct cpu_stop_done done; | 237 | struct cpu_stop_done done; |
239 | struct cpu_stop_work work1, work2; | 238 | struct cpu_stop_work work1, work2; |
240 | struct irq_cpu_stop_queue_work_info call_args; | 239 | struct irq_cpu_stop_queue_work_info call_args; |
241 | struct multi_stop_data msdata = { | 240 | struct multi_stop_data msdata; |
241 | |||
242 | preempt_disable(); | ||
243 | msdata = (struct multi_stop_data){ | ||
242 | .fn = fn, | 244 | .fn = fn, |
243 | .data = arg, | 245 | .data = arg, |
244 | .num_threads = 2, | 246 | .num_threads = 2, |
@@ -262,16 +264,30 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
262 | set_state(&msdata, MULTI_STOP_PREPARE); | 264 | set_state(&msdata, MULTI_STOP_PREPARE); |
263 | 265 | ||
264 | /* | 266 | /* |
267 | * If we observe both CPUs active we know _cpu_down() cannot yet have | ||
268 | * queued its stop_machine works and therefore ours will get executed | ||
269 | * first. Or its not either one of our CPUs that's getting unplugged, | ||
270 | * in which case we don't care. | ||
271 | * | ||
272 | * This relies on the stopper workqueues to be FIFO. | ||
273 | */ | ||
274 | if (!cpu_active(cpu1) || !cpu_active(cpu2)) { | ||
275 | preempt_enable(); | ||
276 | return -ENOENT; | ||
277 | } | ||
278 | |||
279 | /* | ||
265 | * Queuing needs to be done by the lowest numbered CPU, to ensure | 280 | * Queuing needs to be done by the lowest numbered CPU, to ensure |
266 | * that works are always queued in the same order on every CPU. | 281 | * that works are always queued in the same order on every CPU. |
267 | * This prevents deadlocks. | 282 | * This prevents deadlocks. |
268 | */ | 283 | */ |
269 | call_cpu = min(cpu1, cpu2); | 284 | smp_call_function_single(min(cpu1, cpu2), |
270 | 285 | &irq_cpu_stop_queue_work, | |
271 | smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work, | ||
272 | &call_args, 0); | 286 | &call_args, 0); |
287 | preempt_enable(); | ||
273 | 288 | ||
274 | wait_for_completion(&done.completion); | 289 | wait_for_completion(&done.completion); |
290 | |||
275 | return done.executed ? done.ret : -ENOENT; | 291 | return done.executed ? done.ret : -ENOENT; |
276 | } | 292 | } |
277 | 293 | ||