author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2010-03-24 13:34:10 -0400
committer  Ingo Molnar <mingo@elte.hu>                2010-04-02 14:12:03 -0400
commit     0017d735092844118bef006696a750a0e4ef6ebd
tree       8ed1540aaeb63da726f93da12950a8eaa0e0a3e0 /kernel
parent     9084bb8246ea935b98320554229e2f371f7f52fa
sched: Fix TASK_WAKING vs fork deadlock
Oleg noticed a few races with the TASK_WAKING usage on fork.
- since TASK_WAKING is basically a spinlock, it should be IRQ safe
- since we set TASK_WAKING (*) without holding rq->lock, it could be
  that there still is an rq->lock holder, so we do not actually get
  full serialization.
(*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING.
Cure the second issue by not setting TASK_WAKING in sched_fork(), but
only temporarily in wake_up_new_task() while calling select_task_rq().
Cure the first by holding rq->lock around the select_task_rq() call;
this disables IRQs. It does, however, require pushing the rq->lock
release down into select_task_rq_fair()'s cgroup code.
Because select_task_rq_fair() still needs to drop the rq->lock, we
cannot fully get rid of TASK_WAKING.
Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
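To make the new ordering easier to follow, here is a condensed C sketch of
the fork path as it looks after this patch, distilled from the sched_fork()
and wake_up_new_task() hunks below. It is a summary, not the verbatim kernel
code; error handling and the !CONFIG_SMP case are omitted.

        /* sched_fork(): the child is left in TASK_RUNNING; it cannot run or
         * be woken yet because it is not on any runqueue. */
        p->state = TASK_RUNNING;

        /* wake_up_new_task(), CONFIG_SMP: TASK_WAKING is now set only here,
         * temporarily, with rq->lock held and IRQs disabled. */
        rq = task_rq_lock(p, &flags);
        p->state = TASK_WAKING;
        cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); /* may drop/retake rq->lock */
        set_task_cpu(p, cpu);
        p->state = TASK_RUNNING;
        task_rq_unlock(rq, &flags);

        /* re-lock the (possibly different) runqueue and enqueue the child */
        rq = task_rq_lock(p, &flags);
        activate_task(rq, p, 0);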
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched.c           | 65
-rw-r--r--  kernel/sched_fair.c      |  8
-rw-r--r--  kernel/sched_idletask.c  |  3
-rw-r--r--  kernel/sched_rt.c        |  5
4 files changed, 34 insertions, 47 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a38c7a24ed7..dcd17736dae1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -916,14 +916,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 /*
  * Check whether the task is waking, we use this to synchronize against
  * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
  */
 static inline int task_is_waking(struct task_struct *p)
 {
-        return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+        return unlikely(p->state == TASK_WAKING);
 }
 
 /*
@@ -2320,9 +2316,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
 {
-        int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+        int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
 
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@ -2393,17 +2389,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         if (p->sched_class->task_waking)
                 p->sched_class->task_waking(rq, p);
 
-        __task_rq_unlock(rq);
-
-        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu) {
-                /*
-                 * Since we migrate the task without holding any rq->lock,
-                 * we need to be careful with task_rq_lock(), since that
-                 * might end up locking an invalid rq.
-                 */
+        cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+        if (cpu != orig_cpu)
                 set_task_cpu(p, cpu);
-        }
+        __task_rq_unlock(rq);
 
         rq = cpu_rq(cpu);
         raw_spin_lock(&rq->lock);
@@ -2530,11 +2519,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
         __sched_fork(p);
         /*
-         * We mark the process as waking here. This guarantees that
+         * We mark the process as running here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-        p->state = TASK_WAKING;
+        p->state = TASK_RUNNING;
 
         /*
          * Revert to default priority/policy on fork if requested.
@@ -2601,28 +2590,25 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         int cpu __maybe_unused = get_cpu();
 
 #ifdef CONFIG_SMP
+        rq = task_rq_lock(p, &flags);
+        p->state = TASK_WAKING;
+
         /*
          * Fork balancing, do it here and not earlier because:
          * - cpus_allowed can change in the fork path
          * - any previously selected cpu might disappear through hotplug
          *
-         * We still have TASK_WAKING but PF_STARTING is gone now, meaning
-         * ->cpus_allowed is stable, we have preemption disabled, meaning
-         * cpu_online_mask is stable.
+         * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+         * without people poking at ->cpus_allowed.
          */
-        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+        cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
         set_task_cpu(p, cpu);
-#endif
-
-        /*
-         * Since the task is not on the rq and we still have TASK_WAKING set
-         * nobody else will migrate this task.
-         */
-        rq = cpu_rq(cpu);
-        raw_spin_lock_irqsave(&rq->lock, flags);
 
-        BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
+        task_rq_unlock(rq, &flags);
+#endif
+
+        rq = task_rq_lock(p, &flags);
         activate_task(rq, p, 0);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
@@ -3068,19 +3054,15 @@ void sched_exec(void)
 {
         struct task_struct *p = current;
         struct migration_req req;
-        int dest_cpu, this_cpu;
         unsigned long flags;
         struct rq *rq;
-
-        this_cpu = get_cpu();
-        dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
-        if (dest_cpu == this_cpu) {
-                put_cpu();
-                return;
-        }
+        int dest_cpu;
 
         rq = task_rq_lock(p, &flags);
-        put_cpu();
+        dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+        if (dest_cpu == smp_processor_id())
+                goto unlock;
+
         /*
          * select_task_rq() can race against ->cpus_allowed
          */
@@ -3098,6 +3080,7 @@ void sched_exec(void)
 
                 return;
         }
+unlock:
         task_rq_unlock(rq, &flags);
 }
 
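The try_to_wake_up() and sched_exec() hunks above follow the same pattern:
the class ->select_task_rq() call is now made with rq->lock held (and IRQs
off), and any resulting set_task_cpu() happens before that lock is released.
A simplified sketch of the caller side, using the names from the patch:

        /* rq->lock is already held here and IRQs are disabled */
        cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
        if (cpu != orig_cpu)
                set_task_cpu(p, cpu);   /* migrate while still serialized by rq->lock */
        __task_rq_unlock(rq);           /* only now drop the old runqueue lock */

        rq = cpu_rq(cpu);
        raw_spin_lock(&rq->lock);       /* continue under the chosen CPU's rq->lock */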
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 49ad99378f82..8a5e7632d09b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1423,7 +1423,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
 {
         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
         int cpu = smp_processor_id();
@@ -1521,8 +1522,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
                                   cpumask_weight(sched_domain_span(sd))))
                         tmp = affine_sd;
 
-                if (tmp)
+                if (tmp) {
+                        raw_spin_unlock(&rq->lock);
                         update_shares(tmp);
+                        raw_spin_lock(&rq->lock);
+                }
         }
 #endif
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..5af709f503b0 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
         return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 012d69bb67c7..fde895f8044d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
-        struct rq *rq = task_rq(p);
-
         if (sd_flag != SD_BALANCE_WAKE)
                 return smp_processor_id();
 
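All three class implementations above gain the same leading struct rq *
parameter, matching the new select_task_rq() wrapper in kernel/sched.c. The
sched_class hook itself is declared in include/linux/sched.h, which falls
outside this kernel/-limited diffstat; assuming it is updated alongside these
files, the member would presumably look like the following sketch:

        /* assumed prototype; the header change is not shown in this diff */
        int (*select_task_rq)(struct rq *rq, struct task_struct *p,
                              int sd_flag, int flags);

Implementations may drop and re-acquire rq->lock internally, as
select_task_rq_fair() now does around update_shares(), which is why the
commit message notes that TASK_WAKING cannot be removed entirely: callers
must not rely on state that only rq->lock protects across this call.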