sched: Remove get_online_cpus() usage

Remove get_online_cpus() usage from the scheduler; there are 4 sites that
use it:

 - sched_init_smp(); where it's completely superfluous since we're in
   'early' boot and there simply cannot be any hotplugging.

 - sched_getaffinity(); we already take a raw spinlock to protect the
   task cpus_allowed mask, this disables preemption and therefore
   also stabilizes cpu_online_mask as that's modified using
   stop_machine. However switch to active mask for symmetry with
   sched_setaffinity()/set_cpus_allowed_ptr(). We guarantee active
   mask stability by inserting sync_rcu/sched() into _cpu_down.

 - sched_setaffinity(); we don't appear to need get_online_cpus()
   either, there are two sites where hotplug appears relevant:

    * cpuset_cpus_allowed(); for the !cpuset case we use possible_mask,
      for the cpuset case we hold task_lock, which is a spinlock and
      thus for mainline disables preemption (might cause pain on RT).

    * set_cpus_allowed_ptr(); Holds all scheduler locks and thus has
      preemption properly disabled; also it already deals with hotplug
      races explicitly where it releases them.

 - migrate_swap(); we can make stop_two_cpus() do the heavy lifting for
   us with a little trickery. By adding a sync_sched/rcu() after the
   CPU_DOWN_PREPARE notifier we can provide preempt/rcu guarantees for
   cpu_active_mask. Use these to validate that both our cpus are active
   when queueing the stop work before we queue the stop_machine works
   for take_cpu_down().

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/20131011123820.GV3081@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Peter Zijlstra <peterz@infradead.org> 2013-10-11 08:38:20 -0400
committer: Ingo Molnar <mingo@kernel.org> 2013-10-16 08:22:16 -0400
commit: 6acce3ef84520537f8a09a12c9ddbe814a584dd2 (patch)
tree: b4e117df4a57be6a040529c148480227c3d100cc /kernel
parent: 746023159c40c523b08a3bc3d213dac212385895 (diff)
3 files changed, 48 insertions, 15 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        }
        smpboot_park_threads(cpu);
+        /*
+         * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+         * and RCU users of this state to go away such that all new such users
+         * will observe it.
+         *
+         * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+         * not imply sync_sched(), so explicitly call both.
+         */
+#ifdef CONFIG_PREEMPT
+        synchronize_sched();
+#endif
+        synchronize_rcu();
+        /*
+         * So now all preempt/rcu users must observe !cpu_active().
+         */
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
                /* CPU didn't die: tell everyone.  Can't complain. */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a972acd468b0..c06b8d345fae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1085,8 +1085,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
        struct migration_swap_arg arg;
        int ret = -EINVAL;
-        get_online_cpus();
        arg = (struct migration_swap_arg){
                .src_task = cur,
                .src_cpu = task_cpu(cur),
@@ -1097,6 +1095,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
        if (arg.src_cpu == arg.dst_cpu)
                goto out;
+        /*
+         * These three tests are all lockless; this is OK since all of them
+         * will be re-checked with proper locks held further down the line.
+         */
        if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
                goto out;
@@ -1109,7 +1111,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
        ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
 out:
-        put_online_cpus();
        return ret;
 }
@@ -3710,7 +3711,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        struct task_struct *p;
        int retval;
-        get_online_cpus();
        rcu_read_lock();
        p = find_process_by_pid(pid);
@@ -3773,7 +3773,6 @@ out_free_cpus_allowed:
        free_cpumask_var(cpus_allowed);
 out_put_task:
        put_task_struct(p);
-        put_online_cpus();
        return retval;
 }
@@ -3818,7 +3817,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        unsigned long flags;
        int retval;
-        get_online_cpus();
        rcu_read_lock();
        retval = -ESRCH;
@@ -3831,12 +3829,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
                goto out_unlock;
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-        cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+        cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 out_unlock:
        rcu_read_unlock();
-        put_online_cpus();
        return retval;
 }
@@ -6494,14 +6491,17 @@ void __init sched_init_smp(void)
        sched_init_numa();
-        get_online_cpus();
+        /*
+         * There's no userspace yet to cause hotplug operations; hence all the
+         * cpu masks are stable and all blatant races in the below code cannot
+         * happen.
+         */
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
-        put_online_cpus();
        hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
        hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 32a6c44d8f78..c530bc5be7cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -234,11 +234,13 @@ static void irq_cpu_stop_queue_work(void *arg)
 */
 int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
 {
-        int call_cpu;
        struct cpu_stop_done done;
        struct cpu_stop_work work1, work2;
        struct irq_cpu_stop_queue_work_info call_args;
-        struct multi_stop_data msdata = {
+        struct multi_stop_data msdata;
+        preempt_disable();
+        msdata = (struct multi_stop_data){
                .fn = fn,
                .data = arg,
                .num_threads = 2,
@@ -262,16 +264,30 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
        set_state(&msdata, MULTI_STOP_PREPARE);
        /*
+         * If we observe both CPUs active we know _cpu_down() cannot yet have
+         * queued its stop_machine works and therefore ours will get executed
+         * first. Or its not either one of our CPUs that's getting unplugged,
+         * in which case we don't care.
+         *
+         * This relies on the stopper workqueues to be FIFO.
+         */
+        if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+                preempt_enable();
+                return -ENOENT;
+        }
+        /*
         * Queuing needs to be done by the lowest numbered CPU, to ensure
         * that works are always queued in the same order on every CPU.
         * This prevents deadlocks.
         */
-        call_cpu = min(cpu1, cpu2);
+        smp_call_function_single(min(cpu1, cpu2),
+                                 &irq_cpu_stop_queue_work,
-        smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work,
                                 &call_args, 0);
+        preempt_enable();
        wait_for_completion(&done.completion);
        return done.executed ? done.ret : -ENOENT;
 }
author	Peter Zijlstra <peterz@infradead.org>	2013-10-11 08:38:20 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-16 08:22:16 -0400
commit	6acce3ef84520537f8a09a12c9ddbe814a584dd2 (patch)
tree	b4e117df4a57be6a040529c148480227c3d100cc /kernel
parent	746023159c40c523b08a3bc3d213dac212385895 (diff)

diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a6..63aa50d7ce1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
308	}	308	}
309	smpboot_park_threads(cpu);	309	smpboot_park_threads(cpu);
310		310
		311	/*
		312	* By now we've cleared cpu_active_mask, wait for all preempt-disabled
		313	* and RCU users of this state to go away such that all new such users
		314	* will observe it.
		315	*
		316	* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
		317	* not imply sync_sched(), so explicitly call both.
		318	*/
		319	#ifdef CONFIG_PREEMPT
		320	synchronize_sched();
		321	#endif
		322	synchronize_rcu();
		323
		324	/*
		325	* So now all preempt/rcu users must observe !cpu_active().
		326	*/
		327
311	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));	328	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
312	if (err) {	329	if (err) {
313	/* CPU didn't die: tell everyone. Can't complain. */	330	/* CPU didn't die: tell everyone. Can't complain. */


diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a972acd468b0..c06b8d345fae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c
@@ -1085,8 +1085,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1085	struct migration_swap_arg arg;	1085	struct migration_swap_arg arg;
1086	int ret = -EINVAL;	1086	int ret = -EINVAL;
1087		1087
1088	get_online_cpus();
1089
1090	arg = (struct migration_swap_arg){	1088	arg = (struct migration_swap_arg){
1091	.src_task = cur,	1089	.src_task = cur,
1092	.src_cpu = task_cpu(cur),	1090	.src_cpu = task_cpu(cur),
@@ -1097,6 +1095,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1097	if (arg.src_cpu == arg.dst_cpu)	1095	if (arg.src_cpu == arg.dst_cpu)
1098	goto out;	1096	goto out;
1099		1097
		1098	/*
		1099	* These three tests are all lockless; this is OK since all of them
		1100	* will be re-checked with proper locks held further down the line.
		1101	*/
1100	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))	1102	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1101	goto out;	1103	goto out;
1102		1104
@@ -1109,7 +1111,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1109	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);	1111	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1110		1112
1111	out:	1113	out:
1112	put_online_cpus();
1113	return ret;	1114	return ret;
1114	}	1115	}
1115		1116
@@ -3710,7 +3711,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3710	struct task_struct *p;	3711	struct task_struct *p;
3711	int retval;	3712	int retval;
3712		3713
3713	get_online_cpus();
3714	rcu_read_lock();	3714	rcu_read_lock();
3715		3715
3716	p = find_process_by_pid(pid);	3716	p = find_process_by_pid(pid);
@@ -3773,7 +3773,6 @@ out_free_cpus_allowed:
3773	free_cpumask_var(cpus_allowed);	3773	free_cpumask_var(cpus_allowed);
3774	out_put_task:	3774	out_put_task:
3775	put_task_struct(p);	3775	put_task_struct(p);
3776	put_online_cpus();
3777	return retval;	3776	return retval;
3778	}	3777	}
3779		3778
@@ -3818,7 +3817,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3818	unsigned long flags;	3817	unsigned long flags;
3819	int retval;	3818	int retval;
3820		3819
3821	get_online_cpus();
3822	rcu_read_lock();	3820	rcu_read_lock();
3823		3821
3824	retval = -ESRCH;	3822	retval = -ESRCH;
@@ -3831,12 +3829,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3831	goto out_unlock;	3829	goto out_unlock;
3832		3830
3833	raw_spin_lock_irqsave(&p->pi_lock, flags);	3831	raw_spin_lock_irqsave(&p->pi_lock, flags);
3834	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);	3832	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3835	raw_spin_unlock_irqrestore(&p->pi_lock, flags);	3833	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3836		3834
3837	out_unlock:	3835	out_unlock:
3838	rcu_read_unlock();	3836	rcu_read_unlock();
3839	put_online_cpus();
3840		3837
3841	return retval;	3838	return retval;
3842	}	3839	}
@@ -6494,14 +6491,17 @@ void __init sched_init_smp(void)
6494		6491
6495	sched_init_numa();	6492	sched_init_numa();
6496		6493
6497	get_online_cpus();	6494	/*
		6495	* There's no userspace yet to cause hotplug operations; hence all the
		6496	* cpu masks are stable and all blatant races in the below code cannot
		6497	* happen.
		6498	*/
6498	mutex_lock(&sched_domains_mutex);	6499	mutex_lock(&sched_domains_mutex);
6499	init_sched_domains(cpu_active_mask);	6500	init_sched_domains(cpu_active_mask);
6500	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);	6501	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6501	if (cpumask_empty(non_isolated_cpus))	6502	if (cpumask_empty(non_isolated_cpus))
6502	cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);	6503	cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6503	mutex_unlock(&sched_domains_mutex);	6504	mutex_unlock(&sched_domains_mutex);
6504	put_online_cpus();
6505		6505
6506	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);	6506	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6507	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);	6507	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);


diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 32a6c44d8f78..c530bc5be7cf 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c
@@ -234,11 +234,13 @@ static void irq_cpu_stop_queue_work(void *arg)
234	*/	234	*/
235	int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)	235	int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
236	{	236	{
237	int call_cpu;
238	struct cpu_stop_done done;	237	struct cpu_stop_done done;
239	struct cpu_stop_work work1, work2;	238	struct cpu_stop_work work1, work2;
240	struct irq_cpu_stop_queue_work_info call_args;	239	struct irq_cpu_stop_queue_work_info call_args;
241	struct multi_stop_data msdata = {	240	struct multi_stop_data msdata;
		241
		242	preempt_disable();
		243	msdata = (struct multi_stop_data){
242	.fn = fn,	244	.fn = fn,
243	.data = arg,	245	.data = arg,
244	.num_threads = 2,	246	.num_threads = 2,
@@ -262,16 +264,30 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
262	set_state(&msdata, MULTI_STOP_PREPARE);	264	set_state(&msdata, MULTI_STOP_PREPARE);
263		265
264	/*	266	/*
		267	* If we observe both CPUs active we know _cpu_down() cannot yet have
		268	* queued its stop_machine works and therefore ours will get executed
		269	* first. Or its not either one of our CPUs that's getting unplugged,
		270	* in which case we don't care.
		271	*
		272	* This relies on the stopper workqueues to be FIFO.
		273	*/
		274	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
		275	preempt_enable();
		276	return -ENOENT;
		277	}
		278
		279	/*
265	* Queuing needs to be done by the lowest numbered CPU, to ensure	280	* Queuing needs to be done by the lowest numbered CPU, to ensure
266	* that works are always queued in the same order on every CPU.	281	* that works are always queued in the same order on every CPU.
267	* This prevents deadlocks.	282	* This prevents deadlocks.
268	*/	283	*/
269	call_cpu = min(cpu1, cpu2);	284	smp_call_function_single(min(cpu1, cpu2),
270		285	&irq_cpu_stop_queue_work,
271	smp_call_function_single(call_cpu, &irq_cpu_stop_queue_work,
272	&call_args, 0);	286	&call_args, 0);
		287	preempt_enable();
273		288
274	wait_for_completion(&done.completion);	289	wait_for_completion(&done.completion);
		290
275	return done.executed ? done.ret : -ENOENT;	291	return done.executed ? done.ret : -ENOENT;
276	}	292	}
277		293