author     Oleg Nesterov <oleg@redhat.com>     2010-03-15 05:10:27 -0400
committer  Ingo Molnar <mingo@elte.hu>         2010-04-02 14:12:03 -0400
commit     9084bb8246ea935b98320554229e2f371f7f52fa
tree       8478d18125e3b4a7e0a31d702647dee1830d23ef
parent     6a1bdc1b577ebcb65f6603c57f8347309bc4ab13
sched: Make select_fallback_rq() cpuset friendly
Introduce the cpuset_cpus_allowed_fallback() helper to fix the cpuset problems
with select_fallback_rq(). It can be called from any context and cannot take
any cpuset locks, including task_lock(). It is called when the task has no
online CPUs left in ->cpus_allowed but ttwu/etc must still be able to find a
suitable CPU.
I am not proud of this patch. Anything which needs such a fat comment
can't be good even if correct. But I'd prefer not to change the locking
rules in code I hardly understand, and in any case I believe this
simple change makes the code much more correct compared to the deadlocks we
currently have.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091027.GA9155@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  include/linux/cpuset.h |  7
-rw-r--r--  kernel/cpuset.c        | 42
-rw-r--r--  kernel/sched.c         |  4
3 files changed, 50 insertions, 3 deletions
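The change is easier to follow if the fallback decision is spelled out on its own. The sketch below is a standalone userspace model of that decision, not kernel code: cpumask_t is modeled as a plain bitmask, and the names mask_any() and fallback_cpu() are made up for illustration. It mirrors the logic added to kernel/cpuset.c below (pick any online CPU allowed by the task's cpuset; if there is none, widen the task's mask to all possible CPUs and pick any online CPU), while the real helper additionally handles the lockless races described in its comment.

#include <stdio.h>

#define NR_CPUS 8                 /* model a small 8-CPU machine */

typedef unsigned long cpumask_t;  /* hypothetical: one bit per CPU */

/* Return the lowest set bit's index, or NR_CPUS if the mask is empty. */
static int mask_any(cpumask_t mask)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	return NR_CPUS;
}

/*
 * Model of the fallback: copy the cpuset's mask into the task's mask and
 * pick any allowed CPU that is still active; if there is none, widen the
 * task's mask to all possible CPUs and pick any active CPU.
 */
static int fallback_cpu(cpumask_t *task_allowed, cpumask_t cpuset_allowed,
			cpumask_t active, cpumask_t possible)
{
	int cpu;

	*task_allowed = cpuset_allowed;
	cpu = mask_any(*task_allowed & active);
	if (cpu >= NR_CPUS) {
		*task_allowed = possible;
		cpu = mask_any(active);
	}
	return cpu;
}

int main(void)
{
	cpumask_t task_allowed;

	/* The cpuset allows CPUs 2-3, but only CPUs 0-1 are still online. */
	int cpu = fallback_cpu(&task_allowed, 0x0c, 0x03, 0xff);
	printf("fallback CPU = %d, task mask = 0x%lx\n", cpu, task_allowed);
	return 0;
}

Running the model prints fallback CPU = 0 with the task mask widened to 0xff, which is the "No more Mr. Nice Guy" behaviour that select_fallback_rq() ends up with.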
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index eeaaee746bee..a73454aec333 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,6 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
+extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 	cpumask_copy(mask, cpu_possible_mask);
 }
 
+static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
+{
+	cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+	return cpumask_any(cpu_active_mask);
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a747f56d58c..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2188,6 +2188,48 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 	mutex_unlock(&callback_mutex);
 }
 
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+	const struct cpuset *cs;
+	int cpu;
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+	if (cs)
+		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+	rcu_read_unlock();
+
+	/*
+	 * We own tsk->cpus_allowed; nobody can change it under us.
+	 *
+	 * But we read cs and cs->cpus_allowed locklessly and thus can
+	 * race with cgroup_attach_task() or update_cpumask() and get
+	 * the wrong tsk->cpus_allowed. However, both cases imply the
+	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr(),
+	 * which takes task_rq_lock().
+	 *
+	 * If we are called after it dropped the lock we must see all
+	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+	 * set any mask even if it is not right from the task_cs() POV;
+	 * the pending set_cpus_allowed_ptr() will fix things.
+	 */
+
+	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+	if (cpu >= nr_cpu_ids) {
+		/*
+		 * Either tsk->cpus_allowed is wrong (see above) or it
+		 * is actually empty. The latter case is only possible
+		 * if we are racing with remove_tasks_in_empty_cpuset().
+		 * Like above, we can temporarily set any mask and rely on
+		 * set_cpus_allowed_ptr() as the synchronization point.
+		 */
+		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+		cpu = cpumask_any(cpu_active_mask);
+	}
+
+	return cpu;
+}
+
 void cpuset_init_current_mems_allowed(void)
 {
 	nodes_setall(current->mems_allowed);
diff --git a/kernel/sched.c b/kernel/sched.c
index 11119deffa48..9a38c7a24ed7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2300,9 +2300,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 
 	/* No more Mr. Nice Guy. */
 	if (unlikely(dest_cpu >= nr_cpu_ids)) {
-		cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
-		dest_cpu = cpumask_any(cpu_active_mask);
-
+		dest_cpu = cpuset_cpus_allowed_fallback(p);
 		/*
 		 * Don't tell them about moving exiting tasks or
 		 * kernel threads (both mm NULL), since they never