sched: Make select_fallback_rq() cpuset friendly

Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems with select_fallback_rq(). It can be called from any context and can't use any cpuset locks including task_lock(). It is called when the task doesn't have online cpus in ->cpus_allowed but ttwu/etc must be able to find a suitable cpu. I am not proud of this patch. Everything which needs such a fat comment can't be good even if correct. But I'd prefer to not change the locking rules in the code I hardly understand, and in any case I believe this simple change makes the code much more correct compared to the deadlocks we currently have. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20100315091027.GA9155@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Oleg Nesterov <oleg@redhat.com> 2010-03-15 05:10:27 -0400
committer: Ingo Molnar <mingo@elte.hu> 2010-04-02 14:12:03 -0400
commit: 9084bb8246ea935b98320554229e2f371f7f52fa (patch)
tree: 8478d18125e3b4a7e0a31d702647dee1830d23ef /kernel
parent: 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 (diff)
2 files changed, 43 insertions, 3 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a747f56d58c..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2188,6 +2188,48 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
        mutex_unlock(&callback_mutex);
 }
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+        const struct cpuset *cs;
+        int cpu;
+        rcu_read_lock();
+        cs = task_cs(tsk);
+        if (cs)
+                cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+        rcu_read_unlock();
+        /*
+         * At this point tsk->cpus_allowed holds a copy of the cpuset's
+         * cpus_allowed (provided task_cs() returned a cpuset), taken
+         * under rcu_read_lock() only, without any cpuset lock.
+         *
+         * We own tsk->cpus_allowed, nobody can change it under us.
+         *
+         * But we used cs && cs->cpus_allowed lockless and thus can
+         * race with cgroup_attach_task() or update_cpumask() and get
+         * the wrong tsk->cpus_allowed. However, both cases imply the
+         * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+         * which takes task_rq_lock().
+         *
+         * If we are called after it dropped the lock we must see all
+         * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+         * set any mask even if it is not right from task_cs() pov,
+         * the pending set_cpus_allowed_ptr() will fix things.
+         */
+        cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+        if (cpu >= nr_cpu_ids) {
+                /*
+                 * Either tsk->cpus_allowed is wrong (see above) or it
+                 * is actually empty. The latter case is only possible
+                 * if we are racing with remove_tasks_in_empty_cpuset().
+                 * Like above we can temporarily set any mask and rely on
+                 * set_cpus_allowed_ptr() as synchronization point.
+                 *
+                 * So no CPU in the copied mask is active: widen the
+                 * task's mask to every possible CPU and return any
+                 * active one instead.
+                 */
+                cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+                cpu = cpumask_any(cpu_active_mask);
+        }
+        return cpu;
+}
 void cpuset_init_current_mems_allowed(void)
 {
        nodes_setall(current->mems_allowed);
diff --git a/kernel/sched.c b/kernel/sched.c
index 11119deffa48..9a38c7a24ed7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2300,9 +2300,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
        /* No more Mr. Nice Guy. */
        if (unlikely(dest_cpu >= nr_cpu_ids)) {
-                cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+                dest_cpu = cpuset_cpus_allowed_fallback(p);
-                dest_cpu = cpumask_any(cpu_active_mask);
                /*
                 * Don't tell them about moving exiting tasks or
                 * kernel threads (both mm NULL), since they never
author	Oleg Nesterov <oleg@redhat.com>	2010-03-15 05:10:27 -0400
committer	Ingo Molnar <mingo@elte.hu>	2010-04-02 14:12:03 -0400
commit	9084bb8246ea935b98320554229e2f371f7f52fa (patch)
tree	8478d18125e3b4a7e0a31d702647dee1830d23ef /kernel
parent	6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 (diff)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a747f56d58c..9a50c5f6e727 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c
@@ -2188,6 +2188,48 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2188	mutex_unlock(&callback_mutex);	2188	mutex_unlock(&callback_mutex);
2189	}	2189	}
2190		2190
		2191	int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
		2192	{
		2193	const struct cpuset *cs;
		2194	int cpu;
		2195
		2196	rcu_read_lock();
		2197	cs = task_cs(tsk);
		2198	if (cs)
		2199	cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
		2200	rcu_read_unlock();
		2201
		2202	/*
		2203	* We own tsk->cpus_allowed, nobody can change it under us.
		2204	*
		2205	* But we used cs && cs->cpus_allowed lockless and thus can
		2206	* race with cgroup_attach_task() or update_cpumask() and get
		2207	* the wrong tsk->cpus_allowed. However, both cases imply the
		2208	* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
		2209	* which takes task_rq_lock().
		2210	*
		2211	* If we are called after it dropped the lock we must see all
		2212	* changes in task_cs()->cpus_allowed. Otherwise we can temporarily
		2213	* set any mask even if it is not right from task_cs() pov,
		2214	* the pending set_cpus_allowed_ptr() will fix things.
		2215	*/
		2216
		2217	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
		2218	if (cpu >= nr_cpu_ids) {
		2219	/*
		2220	* Either tsk->cpus_allowed is wrong (see above) or it
		2221	* is actually empty. The latter case is only possible
		2222	* if we are racing with remove_tasks_in_empty_cpuset().
		2223	* Like above we can temporarily set any mask and rely on
		2224	* set_cpus_allowed_ptr() as synchronization point.
		2225	*/
		2226	cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
		2227	cpu = cpumask_any(cpu_active_mask);
		2228	}
		2229
		2230	return cpu;
		2231	}
		2232
2191	void cpuset_init_current_mems_allowed(void)	2233	void cpuset_init_current_mems_allowed(void)
2192	{	2234	{
2193	nodes_setall(current->mems_allowed);	2235	nodes_setall(current->mems_allowed);


diff --git a/kernel/sched.c b/kernel/sched.c index 11119deffa48..9a38c7a24ed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -2300,9 +2300,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2300		2300
2301	/* No more Mr. Nice Guy. */	2301	/* No more Mr. Nice Guy. */
2302	if (unlikely(dest_cpu >= nr_cpu_ids)) {	2302	if (unlikely(dest_cpu >= nr_cpu_ids)) {
2303	cpumask_copy(&p->cpus_allowed, cpu_possible_mask);	2303	dest_cpu = cpuset_cpus_allowed_fallback(p);
2304	dest_cpu = cpumask_any(cpu_active_mask);
2305
2306	/*	2304	/*
2307	* Don't tell them about moving exiting tasks or	2305	* Don't tell them about moving exiting tasks or
2308	* kernel threads (both mm NULL), since they never	2306	* kernel threads (both mm NULL), since they never