author    Peter Zijlstra <a.p.zijlstra@chello.nl>  2010-03-24 13:34:10 -0400
committer Ingo Molnar <mingo@elte.hu>              2010-04-02 14:12:03 -0400
commit    0017d735092844118bef006696a750a0e4ef6ebd (patch)
tree      8ed1540aaeb63da726f93da12950a8eaa0e0a3e0
parent    9084bb8246ea935b98320554229e2f371f7f52fa (diff)
sched: Fix TASK_WAKING vs fork deadlock
Oleg noticed a few races with the TASK_WAKING usage on fork:

 - since TASK_WAKING is basically a spinlock, it should be IRQ safe
 - since we set TASK_WAKING (*) without holding rq->lock, it could be
   that there is still an rq->lock holder, so we are not actually
   providing full serialization.

(*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING.

Cure the second issue by not setting TASK_WAKING in sched_fork(), but
only temporarily in wake_up_new_task() while calling select_task_rq().

Cure the first by holding rq->lock around the select_task_rq() call;
this disables IRQs, but it requires that we push the rq->lock release
down into select_task_rq_fair()'s cgroup code.

Because select_task_rq_fair() still needs to drop the rq->lock we
cannot fully get rid of TASK_WAKING.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
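For orientation, here is a minimal sketch of the fork-balancing path as it
looks after this patch. It is condensed from the wake_up_new_task() hunk
below; get_cpu()/put_cpu(), tracing, the task_woken hook and the !SMP case
are left out, so treat it as an illustration rather than the exact code:

        void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        {
                unsigned long flags;
                struct rq *rq;
                int cpu;

                /* Hold rq->lock (IRQs off) across the whole CPU selection. */
                rq = task_rq_lock(p, &flags);

                /*
                 * TASK_WAKING is only set here, temporarily, so that
                 * select_task_rq() may drop and retake rq->lock (for the
                 * cgroup shares update) without anyone poking at
                 * ->cpus_allowed in the meantime.
                 */
                p->state = TASK_WAKING;
                cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
                set_task_cpu(p, cpu);
                p->state = TASK_RUNNING;
                task_rq_unlock(rq, &flags);

                /* Now queue the child on its final runqueue. */
                rq = task_rq_lock(p, &flags);
                activate_task(rq, p, 0);
                check_preempt_curr(rq, p, WF_FORK);
                task_rq_unlock(rq, &flags);
        }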
-rw-r--r--  include/linux/sched.h      3
-rw-r--r--  kernel/sched.c            65
-rw-r--r--  kernel/sched_fair.c        8
-rw-r--r--  kernel/sched_idletask.c    3
-rw-r--r--  kernel/sched_rt.c          5
5 files changed, 36 insertions(+), 48 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bea40725c76..fb6c18843ee8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,7 +1046,8 @@ struct sched_class {
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-        int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+        int (*select_task_rq)(struct rq *rq, struct task_struct *p,
+                              int sd_flag, int flags);
 
         void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
         void (*post_schedule) (struct rq *this_rq);
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a38c7a24ed7..dcd17736dae1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -916,14 +916,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 /*
  * Check whether the task is waking, we use this to synchronize against
  * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
  */
 static inline int task_is_waking(struct task_struct *p)
 {
-        return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+        return unlikely(p->state == TASK_WAKING);
 }
 
 /*
@@ -2320,9 +2316,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
 {
-        int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+        int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
 
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@ -2393,17 +2389,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         if (p->sched_class->task_waking)
                 p->sched_class->task_waking(rq, p);
 
-        __task_rq_unlock(rq);
-
-        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu) {
-                /*
-                 * Since we migrate the task without holding any rq->lock,
-                 * we need to be careful with task_rq_lock(), since that
-                 * might end up locking an invalid rq.
-                 */
+        cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+        if (cpu != orig_cpu)
                 set_task_cpu(p, cpu);
-        }
+        __task_rq_unlock(rq);
 
         rq = cpu_rq(cpu);
         raw_spin_lock(&rq->lock);
@@ -2530,11 +2519,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
         __sched_fork(p);
         /*
-         * We mark the process as waking here. This guarantees that
+         * We mark the process as running here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-        p->state = TASK_WAKING;
+        p->state = TASK_RUNNING;
 
         /*
          * Revert to default priority/policy on fork if requested.
@@ -2601,28 +2590,25 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         int cpu __maybe_unused = get_cpu();
 
 #ifdef CONFIG_SMP
+        rq = task_rq_lock(p, &flags);
+        p->state = TASK_WAKING;
+
         /*
          * Fork balancing, do it here and not earlier because:
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
          *
-         * We still have TASK_WAKING but PF_STARTING is gone now, meaning
-         * ->cpus_allowed is stable, we have preemption disabled, meaning
-         * cpu_online_mask is stable.
+         * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+         * without people poking at ->cpus_allowed.
          */
-        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+        cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
         set_task_cpu(p, cpu);
-#endif
-
-        /*
-         * Since the task is not on the rq and we still have TASK_WAKING set
-         * nobody else will migrate this task.
-         */
-        rq = cpu_rq(cpu);
-        raw_spin_lock_irqsave(&rq->lock, flags);
 
-        BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
+        task_rq_unlock(rq, &flags);
+#endif
+
+        rq = task_rq_lock(p, &flags);
         activate_task(rq, p, 0);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
@@ -3068,19 +3054,15 @@ void sched_exec(void)
 {
         struct task_struct *p = current;
         struct migration_req req;
-        int dest_cpu, this_cpu;
         unsigned long flags;
         struct rq *rq;
-
-        this_cpu = get_cpu();
-        dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
-        if (dest_cpu == this_cpu) {
-                put_cpu();
-                return;
-        }
+        int dest_cpu;
 
         rq = task_rq_lock(p, &flags);
-        put_cpu();
+        dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+        if (dest_cpu == smp_processor_id())
+                goto unlock;
+
         /*
          * select_task_rq() can race against ->cpus_allowed
          */
@@ -3098,6 +3080,7 @@ void sched_exec(void)
 
                 return;
         }
+unlock:
         task_rq_unlock(rq, &flags);
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 49ad99378f82..8a5e7632d09b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1423,7 +1423,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
 {
         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
         int cpu = smp_processor_id();
@@ -1521,8 +1522,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                                   cpumask_weight(sched_domain_span(sd))))
                         tmp = affine_sd;
 
-                if (tmp)
+                if (tmp) {
+                        raw_spin_unlock(&rq->lock);
                         update_shares(tmp);
+                        raw_spin_lock(&rq->lock);
+                }
         }
 #endif
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..5af709f503b0 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
         return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 012d69bb67c7..fde895f8044d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
-        struct rq *rq = task_rq(p);
-
         if (sd_flag != SD_BALANCE_WAKE)
                 return smp_processor_id();
 