author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2010-01-21 15:04:57 -0500
committer  Thomas Gleixner <tglx@linutronix.de>       2010-01-21 17:25:31 -0500
commit     fabf318e5e4bda0aca2b0d617b191884fda62703
tree       651b2ee4fb8f393d2fe93f133a5ec6129cb7a8e8
parent     6d558c3ac9b6508d26fd5cadccce51fc9d726b1c
sched: Fix fork vs hotplug vs cpuset namespaces
There are a number of issues:
1) TASK_WAKING vs cgroup_clone (cpusets)
  copy_process():
    sched_fork()
      child->state = TASK_WAKING; /* waiting for wake_up_new_task() */
    if (current->nsproxy != p->nsproxy)
      ns_cgroup_clone()
        cgroup_clone()
          mutex_lock(inode->i_mutex)
          mutex_lock(cgroup_mutex)
          cgroup_attach_task()
            ss->can_attach()
            ss->attach() [ -> cpuset_attach() ]
              cpuset_attach_task()
                set_cpus_allowed_ptr();
                  while (child->state == TASK_WAKING)
                    cpu_relax();
      will deadlock the system.
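
To make the cycle above concrete, here is a minimal user-space model (hypothetical, not kernel code; the names only mirror the kernel functions): the fork path spins until the child leaves TASK_WAKING, but the only code that clears TASK_WAKING is wake_up_new_task(), which the same caller reaches only after copy_process() returns, so the spin never terminates and running the sketch simply hangs.

  #include <stdio.h>

  /* Hypothetical user-space model of issue 1; the names mirror the kernel
   * functions but nothing here is kernel code. */
  enum state { TASK_WAKING, TASK_RUNNING };

  static enum state child_state = TASK_WAKING;  /* set by sched_fork() */

  static void cpuset_attach_task(void)
  {
          /* models set_cpus_allowed_ptr() synchronizing against TASK_WAKING */
          while (child_state == TASK_WAKING)
                  ;  /* cpu_relax(): spins forever in this scenario */
  }

  static void copy_process(void)
  {
          /* ns_cgroup_clone() -> cgroup_clone() -> ... -> cpuset_attach_task() */
          cpuset_attach_task();  /* never returns */
  }

  static void wake_up_new_task(void)
  {
          child_state = TASK_RUNNING;  /* the only place TASK_WAKING is cleared */
  }

  int main(void)
  {
          copy_process();        /* deadlocks: the wakeup below is never reached */
          wake_up_new_task();
          printf("unreachable\n");
          return 0;
  }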
2) cgroup_clone (cpusets) vs copy_process
So even if the above would work, we still have:
  copy_process():
    if (current->nsproxy != p->nsproxy)
      ns_cgroup_clone()
        cgroup_clone()
          mutex_lock(inode->i_mutex)
          mutex_lock(cgroup_mutex)
          cgroup_attach_task()
            ss->can_attach()
            ss->attach() [ -> cpuset_attach() ]
              cpuset_attach_task()
                set_cpus_allowed_ptr();
    ...
    p->cpus_allowed = current->cpus_allowed
over-writing the modified cpus_allowed.
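
The lost update can be modeled the same way (again a hypothetical user-space sketch, with plain bitmasks standing in for the kernel's cpumask handling): the cpuset attach updates the child's mask, and the later re-copy from the parent in copy_process() silently discards it.

  #include <stdio.h>

  /* Hypothetical user-space model of issue 2; plain unsigned longs stand in
   * for the kernel's cpumask handling. */
  static unsigned long parent_cpus_allowed = 0x0f;  /* parent: CPUs 0-3 */
  static unsigned long child_cpus_allowed;

  static void set_cpus_allowed_ptr(unsigned long new_mask)
  {
          child_cpus_allowed = new_mask;  /* cpuset_attach_task() moved the child */
  }

  static void copy_process(void)
  {
          child_cpus_allowed = parent_cpus_allowed;  /* initial copy at fork */

          /* ns_cgroup_clone(): the new cpuset restricts the child to CPUs 4-5 */
          set_cpus_allowed_ptr(0x30);

          /* the pre-patch re-copy in copy_process() throws that update away */
          child_cpus_allowed = parent_cpus_allowed;
  }

  int main(void)
  {
          copy_process();
          printf("child mask: 0x%lx, cpuset wanted 0x30\n", child_cpus_allowed);
          return 0;
  }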
3) fork() vs hotplug
If we unplug the child's CPU after the sanity check, once the child has been
attached to the task list but before wake_up_new_task() runs, the child can be
left pointing at a CPU that no longer exists and things go badly wrong.
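
A minimal user-space illustration of this window (hypothetical, not kernel code): the CPU chosen at fork time passes the sanity check, then goes away before wake_up_new_task() runs, leaving the child pointed at an offline CPU.

  #include <stdio.h>

  /* Hypothetical user-space model of issue 3; a plain bitmask stands in for
   * cpu_online_mask and an int for the task's assigned CPU. */
  static unsigned long online_mask = 0x0f;  /* CPUs 0-3 online */
  static int child_cpu;

  int main(void)
  {
          child_cpu = 3;                  /* pre-patch fork-time balancing picks CPU 3 */
          /* the sanity check in copy_process() passes: CPU 3 is still online here */

          online_mask &= ~(1UL << 3);     /* CPU 3 is unplugged... */

          /* ...and only now does wake_up_new_task() run */
          if (!(online_mask & (1UL << child_cpu)))
                  printf("child left on offline CPU %d\n", child_cpu);
          return 0;
  }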
Solve all these issues by moving fork cpu selection into
wake_up_new_task().
Reported-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1264106190.4283.1314.camel@laptop>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 kernel/fork.c  | 15
 kernel/sched.c | 39
 2 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..f88bd984df35 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         /* Need tasklist lock for parent etc handling! */
         write_lock_irq(&tasklist_lock);
 
-        /*
-         * The task hasn't been attached yet, so its cpus_allowed mask will
-         * not be changed, nor will its assigned CPU.
-         *
-         * The cpus_allowed mask of the parent may have changed after it was
-         * copied first time - so re-copy it here, then check the child's CPU
-         * to ensure it is on a valid CPU (and if not, just force it back to
-         * parent's CPU). This avoids alot of nasty races.
-         */
-        p->cpus_allowed = current->cpus_allowed;
-        p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
-        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
-                        !cpu_online(task_cpu(p))))
-                set_task_cpu(p, smp_processor_id());
-
         /* CLONE_PARENT re-uses the old parent */
         if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                 p->real_parent = current->real_parent;
diff --git a/kernel/sched.c b/kernel/sched.c
index 4508fe7048be..3a8fb30a91b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * Called from:
+ * Gets called from 3 sites (exec, fork, wakeup), since it is called without
+ * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
+ * by:
  *
- * - fork, @p is stable because it isn't on the tasklist yet
- *
- * - exec, @p is unstable, retry loop
- *
- * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
- *   we should be good.
+ * exec: is unstable, retry loop
+ * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
  */
 static inline
 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
 
-#ifdef CONFIG_SMP
-        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
         set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
         unsigned long flags;
         struct rq *rq;
+        int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+        /*
+         * Fork balancing, do it here and not earlier because:
+         *  - cpus_allowed can change in the fork path
+         *  - any previously selected cpu might disappear through hotplug
+         *
+         * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+         * ->cpus_allowed is stable, we have preemption disabled, meaning
+         * cpu_online_mask is stable.
+         */
+        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+        set_task_cpu(p, cpu);
+#endif
 
         rq = task_rq_lock(p, &flags);
         BUG_ON(p->state != TASK_WAKING);
@@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_woken(rq, p);
 #endif
         task_rq_unlock(rq, &flags);
+        put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7139,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
          * the ->cpus_allowed mask from under waking tasks, which would be
          * possible when we change rq->lock in ttwu(), so synchronize against
          * TASK_WAKING to avoid that.
+         *
+         * Make an exception for freshly cloned tasks, since cpuset namespaces
+         * might move the task about, we have to validate the target in
+         * wake_up_new_task() anyway since the cpu might have gone away.
          */
 again:
-        while (p->state == TASK_WAKING)
+        while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
                 cpu_relax();
 
         rq = task_rq_lock(p, &flags);
 
-        if (p->state == TASK_WAKING) {
+        if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
                 task_rq_unlock(rq, &flags);
                 goto again;
         }