sched: Ensure set_task_cpu() is never called on blocked tasks

In order to clean up the set_task_cpu() rq dependencies we need to ensure it is never called on blocked tasks because such usage does not pair with consistent rq->lock usage. This puts the migration burden on ttwu(). Furthermore we need to close a race against changing ->cpus_allowed, since select_task_rq() runs with only preemption disabled. For sched_fork() this is safe because the child isn't in the tasklist yet, for wakeup we fix this by synchronizing set_cpus_allowed_ptr() against TASK_WAKING, which leaves sched_exec to be a problem. This also closes a hole in (6ad4c1888 sched: Fix balance vs hotplug race) where ->select_task_rq() doesn't validate the result against the sched_domain/root_domain. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Galbraith <efault@gmx.de> LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2009-12-16 12:04:36 -0500
committer: Ingo Molnar <mingo@elte.hu> 2009-12-16 13:01:56 -0500
commit: e2912009fb7b715728311b0d8fe327a1432b3f79 (patch)
tree: 9918e2f9690d85b7d7f5550d09f0ae6cc3fa4e0f /kernel
parent: 06b83b5fbea273672822b6ee93e16781046553ec (diff)
1 files changed, 66 insertions, 19 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 1672823aabfe..33d7965f63f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 */
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
-        struct rq *rq = cpu_rq(cpu);
-        unsigned long flags;
        /* Must have done schedule() in kthread() before we set_task_cpu */
        if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
                WARN_ON(1);
                return;
        }
-        raw_spin_lock_irqsave(&rq->lock, flags);
-        update_rq_clock(rq);
-        set_task_cpu(p, cpu);
        p->cpus_allowed = cpumask_of_cpu(cpu);
        p->rt.nr_cpus_allowed = 1;
        p->flags |= PF_THREAD_BOUND;
-        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 EXPORT_SYMBOL(kthread_bind);
@@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        struct cfs_rq *old_cfsrq = task_cfs_rq(p),
                      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
+#ifdef CONFIG_SCHED_DEBUG
+        /*
+         * We should never call set_task_cpu() on a blocked task,
+         * ttwu() will sort out the placement.
+         */
+        WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING);
+#endif
        trace_sched_migrate_task(p, new_cpu);
        if (old_cpu != new_cpu) {
@@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
        /*
         * If the task is not on a runqueue (and not running), then
-         * it is sufficient to simply update the task's cpu field.
+         * the next wake-up will properly place the task.
         */
-        if (!p->se.on_rq && !task_running(rq, p)) {
+        if (!p->se.on_rq && !task_running(rq, p))
-                update_rq_clock(rq);
-                set_task_cpu(p, dest_cpu);
                return 0;
-        }
        init_completion(&req->done);
        req->task = p;
@@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p,
 }
 #ifdef CONFIG_SMP
+/*
+ * Called from:
+ *
+ *  - fork, @p is stable because it isn't on the tasklist yet
+ *
+ *  - exec, @p is unstable XXX
+ *
+ *  - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
+ *             we should be good.
+ */
 static inline
 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-        return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+        int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+        /*
+         * In order not to call set_task_cpu() on a blocking task we need
+         * to rely on ttwu() to place the task on a valid ->cpus_allowed
+         * cpu.
+         *
+         * Since this is common to all placement strategies, this lives here.
+         *
+         * [ this allows ->select_task() to simply return task_cpu(p) and
+         *   not worry about this generic constraint ]
+         */
+        if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+                     !cpu_active(cpu))) {
+                cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+                /*
+                 * XXX: race against hot-plug modifying cpu_active_mask
+                 */
+                BUG_ON(cpu >= nr_cpu_ids);
+        }
+        return cpu;
 }
 #endif
@@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        struct rq *rq;
        int ret = 0;
+        /*
+         * Since we rely on wake-ups to migrate sleeping tasks, don't change
+         * the ->cpus_allowed mask from under waking tasks, which would be
+         * possible when we change rq->lock in ttwu(), so synchronize against
+         * TASK_WAKING to avoid that.
+         */
+again:
+        while (p->state == TASK_WAKING)
+                cpu_relax();
        rq = task_rq_lock(p, &flags);
+        if (p->state == TASK_WAKING) {
+                task_rq_unlock(rq, &flags);
+                goto again;
+        }
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;
@@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
        struct rq *rq_dest, *rq_src;
-        int ret = 0, on_rq;
+        int ret = 0;
        if (unlikely(!cpu_active(dest_cpu)))
                return ret;
@@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
-        on_rq = p->se.on_rq;
+        /*
-        if (on_rq)
+         * If we're not on a rq, the next wake-up will ensure we're
+         * placed properly.
+         */
+        if (p->se.on_rq) {
                deactivate_task(rq_src, p, 0);
+                set_task_cpu(p, dest_cpu);
-        set_task_cpu(p, dest_cpu);
-        if (on_rq) {
                activate_task(rq_dest, p, 0);
                check_preempt_curr(rq_dest, p, 0);
        }
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-12-16 12:04:36 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-12-16 13:01:56 -0500
commit	e2912009fb7b715728311b0d8fe327a1432b3f79 (patch)
tree	9918e2f9690d85b7d7f5550d09f0ae6cc3fa4e0f /kernel
parent	06b83b5fbea273672822b6ee93e16781046553ec (diff)

diff --git a/kernel/sched.c b/kernel/sched.c index 1672823aabfe..33d7965f63f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2018	*/	2018	*/
2019	void kthread_bind(struct task_struct *p, unsigned int cpu)	2019	void kthread_bind(struct task_struct *p, unsigned int cpu)
2020	{	2020	{
2021	struct rq *rq = cpu_rq(cpu);
2022	unsigned long flags;
2023
2024	/* Must have done schedule() in kthread() before we set_task_cpu */	2021	/* Must have done schedule() in kthread() before we set_task_cpu */
2025	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {	2022	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2026	WARN_ON(1);	2023	WARN_ON(1);
2027	return;	2024	return;
2028	}	2025	}
2029		2026
2030	raw_spin_lock_irqsave(&rq->lock, flags);
2031	update_rq_clock(rq);
2032	set_task_cpu(p, cpu);
2033	p->cpus_allowed = cpumask_of_cpu(cpu);	2027	p->cpus_allowed = cpumask_of_cpu(cpu);
2034	p->rt.nr_cpus_allowed = 1;	2028	p->rt.nr_cpus_allowed = 1;
2035	p->flags |= PF_THREAD_BOUND;	2029	p->flags |= PF_THREAD_BOUND;
2036	raw_spin_unlock_irqrestore(&rq->lock, flags);
2037	}	2030	}
2038	EXPORT_SYMBOL(kthread_bind);	2031	EXPORT_SYMBOL(kthread_bind);
2039		2032
@@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2074	struct cfs_rq *old_cfsrq = task_cfs_rq(p),	2067	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2075	*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);	2068	*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2076		2069
		2070	#ifdef CONFIG_SCHED_DEBUG
		2071	/*
		2072	* We should never call set_task_cpu() on a blocked task,
		2073	* ttwu() will sort out the placement.
		2074	*/
		2075	WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING);
		2076	#endif
		2077
2077	trace_sched_migrate_task(p, new_cpu);	2078	trace_sched_migrate_task(p, new_cpu);
2078		2079
2079	if (old_cpu != new_cpu) {	2080	if (old_cpu != new_cpu) {
@@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2107		2108
2108	/*	2109	/*
2109	* If the task is not on a runqueue (and not running), then	2110	* If the task is not on a runqueue (and not running), then
2110	* it is sufficient to simply update the task's cpu field.	2111	* the next wake-up will properly place the task.
2111	*/	2112	*/
2112	if (!p->se.on_rq && !task_running(rq, p)) {	2113	if (!p->se.on_rq && !task_running(rq, p))
2113	update_rq_clock(rq);
2114	set_task_cpu(p, dest_cpu);
2115	return 0;	2114	return 0;
2116	}
2117		2115
2118	init_completion(&req->done);	2116	init_completion(&req->done);
2119	req->task = p;	2117	req->task = p;
@@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p,
2319	}	2317	}
2320		2318
2321	#ifdef CONFIG_SMP	2319	#ifdef CONFIG_SMP
		2320	/*
		2321	* Called from:
		2322	*
		2323	* - fork, @p is stable because it isn't on the tasklist yet
		2324	*
		2325	* - exec, @p is unstable XXX
		2326	*
		2327	* - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
		2328	* we should be good.
		2329	*/
2322	static inline	2330	static inline
2323	int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)	2331	int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2324	{	2332	{
2325	return p->sched_class->select_task_rq(p, sd_flags, wake_flags);	2333	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
		2334
		2335	/*
		2336	* In order not to call set_task_cpu() on a blocking task we need
		2337	* to rely on ttwu() to place the task on a valid ->cpus_allowed
		2338	* cpu.
		2339	*
		2340	* Since this is common to all placement strategies, this lives here.
		2341	*
		2342	* [ this allows ->select_task() to simply return task_cpu(p) and
		2343	* not worry about this generic constraint ]
		2344	*/
		2345	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
		2346	!cpu_active(cpu))) {
		2347
		2348	cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
		2349	/*
		2350	* XXX: race against hot-plug modifying cpu_active_mask
		2351	*/
		2352	BUG_ON(cpu >= nr_cpu_ids);
		2353	}
		2354
		2355	return cpu;
2326	}	2356	}
2327	#endif	2357	#endif
2328		2358
@@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7098	struct rq *rq;	7128	struct rq *rq;
7099	int ret = 0;	7129	int ret = 0;
7100		7130
		7131	/*
		7132	* Since we rely on wake-ups to migrate sleeping tasks, don't change
		7133	* the ->cpus_allowed mask from under waking tasks, which would be
		7134	* possible when we change rq->lock in ttwu(), so synchronize against
		7135	* TASK_WAKING to avoid that.
		7136	*/
		7137	again:
		7138	while (p->state == TASK_WAKING)
		7139	cpu_relax();
		7140
7101	rq = task_rq_lock(p, &flags);	7141	rq = task_rq_lock(p, &flags);
		7142
		7143	if (p->state == TASK_WAKING) {
		7144	task_rq_unlock(rq, &flags);
		7145	goto again;
		7146	}
		7147
7102	if (!cpumask_intersects(new_mask, cpu_active_mask)) {	7148	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7103	ret = -EINVAL;	7149	ret = -EINVAL;
7104	goto out;	7150	goto out;
@@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7154	static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)	7200	static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7155	{	7201	{
7156	struct rq *rq_dest, *rq_src;	7202	struct rq *rq_dest, *rq_src;
7157	int ret = 0, on_rq;	7203	int ret = 0;
7158		7204
7159	if (unlikely(!cpu_active(dest_cpu)))	7205	if (unlikely(!cpu_active(dest_cpu)))
7160	return ret;	7206	return ret;
@@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7170	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))	7216	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7171	goto fail;	7217	goto fail;
7172		7218
7173	on_rq = p->se.on_rq;	7219	/*
7174	if (on_rq)	7220	* If we're not on a rq, the next wake-up will ensure we're
		7221	* placed properly.
		7222	*/
		7223	if (p->se.on_rq) {
7175	deactivate_task(rq_src, p, 0);	7224	deactivate_task(rq_src, p, 0);
7176		7225	set_task_cpu(p, dest_cpu);
7177	set_task_cpu(p, dest_cpu);
7178	if (on_rq) {
7179	activate_task(rq_dest, p, 0);	7226	activate_task(rq_dest, p, 0);
7180	check_preempt_curr(rq_dest, p, 0);	7227	check_preempt_curr(rq_dest, p, 0);
7181	}	7228	}