author	Ingo Molnar <mingo@elte.hu>	2011-04-18 08:53:18 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-04-18 08:53:33 -0400
commit	6ddafdaab3f809b110ada253d2f2d4910ebd3ac5 (patch)
tree	366bb7513511a05b6e11ab89bfe3b2dbd1d62a03	/kernel/sched.c
parent	3905c54f2bd2c6f937f87307987ca072eabc3e7b (diff)
parent	bd8e7dded88a3e1c085c333f19ff31387616f71a (diff)

Merge branch 'sched/locking' into sched/core

Merge reason: the rq locking changes are stable, propagate them into the .40 queue.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	650
1 file changed, 353 insertions(+), 297 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 506cb8147c70..0cfe0310ed5d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -312,6 +312,9 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
+#endif
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
@@ -554,6 +557,10 @@ struct rq {
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
 #endif
+
+#ifdef CONFIG_SMP
+	struct task_struct *wake_list;
+#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -597,7 +604,7 @@ static inline int cpu_of(struct rq *rq)
  * Return the group to which this tasks belongs.
  *
  * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
  * holds that lock for each task it moves into the cgroup. Therefore
  * by holding that lock, we pin the task to the current cgroup.
  */
@@ -607,7 +614,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&task_rq(p)->lock));
+			lockdep_is_held(&p->pi_lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -839,18 +846,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
 	return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+	return p->on_cpu;
+#else
 	return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->on_cpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->on_cpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
 	rq->lock.owner = current;
@@ -866,15 +894,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-	return p->oncpu;
-#else
-	return task_current(rq, p);
-#endif
-}
-
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -883,7 +902,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 	 * SMP rebalancing from interrupt is the only thing that cares
 	 * here.
 	 */
-	next->oncpu = 1;
+	next->on_cpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	raw_spin_unlock_irq(&rq->lock);
@@ -896,12 +915,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 */
 	smp_wmb();
-	prev->oncpu = 0;
+	prev->on_cpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_enable();
@@ -910,23 +929,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
-	return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
+	lockdep_assert_held(&p->pi_lock);
+
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
@@ -937,22 +948,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 }
 
 /*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
 	for (;;) {
-		local_irq_save(*flags);
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p)))
 			return rq;
-		raw_spin_unlock_irqrestore(&rq->lock, *flags);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 	}
 }
 
@@ -962,10 +973,13 @@ static void __task_rq_unlock(struct rq *rq)
 	raw_spin_unlock(&rq->lock);
 }
 
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
-	raw_spin_unlock_irqrestore(&rq->lock, *flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
 /*
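For readers new to the locking scheme this series introduces, here is a minimal userspace sketch of the pattern that the new task_rq_lock()/task_rq_unlock() implement above: take the task's pi_lock first, then the runqueue lock, and retry if the task migrated to another runqueue in between. This is illustrative only; the *_demo names and the pthread-based locks are stand-ins, not kernel code.

#include <pthread.h>
#include <stdatomic.h>

struct rq_demo   { pthread_mutex_t lock; };
struct task_demo { pthread_mutex_t pi_lock; _Atomic(struct rq_demo *) rq; };

/* Both locks are held on return, always in the order pi_lock -> rq->lock. */
static struct rq_demo *task_rq_lock_demo(struct task_demo *p)
{
	for (;;) {
		pthread_mutex_lock(&p->pi_lock);
		struct rq_demo *rq = atomic_load(&p->rq);	/* snapshot the task's runqueue */
		pthread_mutex_lock(&rq->lock);
		if (rq == atomic_load(&p->rq))			/* still on that runqueue? done */
			return rq;
		pthread_mutex_unlock(&rq->lock);		/* it migrated meanwhile: retry */
		pthread_mutex_unlock(&p->pi_lock);
	}
}

static void task_rq_unlock_demo(struct rq_demo *rq, struct task_demo *p)
{
	pthread_mutex_unlock(&rq->lock);			/* release in reverse order */
	pthread_mutex_unlock(&p->pi_lock);
}

Because every path that can move a task now holds either p->pi_lock or the task's rq->lock, __task_rq_lock() above can assert lockdep_assert_held(&p->pi_lock), and set_task_cpu() (further down in this diff) warns under CONFIG_LOCKDEP unless one of those two locks is held.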
@@ -1774,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
-	p->se.on_rq = 1;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1782,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
-	p->se.on_rq = 0;
 }
 
 /*
@@ -2117,7 +2129,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule. In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -2163,6 +2175,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2183,19 +2200,6 @@ struct migration_arg {
 static int migration_cpu_stop(void *data);
 
 /*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
-{
-	/*
-	 * If the task is not on a runqueue (and not running), then
-	 * the next wake-up will properly place the task.
-	 */
-	return p->se.on_rq || task_running(rq, p);
-}
-
-/*
  * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2252,11 +2256,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->se.on_rq;
+		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, p, &flags);
 
 		/*
 		 * If it changed from the expected state, bail out now.
@@ -2331,7 +2335,7 @@ EXPORT_SYMBOL_GPL(kick_process);
 
 #ifdef CONFIG_SMP
 /*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -2364,12 +2368,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -2395,27 +2399,60 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-				 bool is_sync, bool is_migrate, bool is_local,
-				 unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
+#ifdef CONFIG_SCHEDSTATS
+	struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+	int this_cpu = smp_processor_id();
+
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	} else {
+		struct sched_domain *sd;
+
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+	}
+#endif /* CONFIG_SMP */
+
+	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);
-	if (is_sync)
+
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-	if (is_migrate)
+
+	if (cpu != task_cpu(p))
 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-	if (is_local)
-		schedstat_inc(p, se.statistics.nr_wakeups_local);
-	else
-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
 
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
 	activate_task(rq, p, en_flags);
+	p->on_rq = 1;
+
+	/* if a worker is waking up, notify workqueue */
+	if (p->flags & PF_WQ_WORKER)
+		wq_worker_waking_up(p, cpu_of(rq));
 }
 
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-					int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-	trace_sched_wakeup(p, success);
+	trace_sched_wakeup(p, true);
 	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
@@ -2434,9 +2471,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 		rq->idle_stamp = 0;
 	}
 #endif
-	/* if a worker is waking up, notify workqueue */
-	if ((p->flags & PF_WQ_WORKER) && success)
-		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible--;
+#endif
+
+	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (p->on_rq) {
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	raw_spin_lock(&rq->lock);
+
+	while (list) {
+		struct task_struct *p = list;
+		list = list->wake_entry;
+		ttwu_do_activate(rq, p, 0);
+	}
+
+	raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+	sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *next = rq->wake_list;
+
+	for (;;) {
+		struct task_struct *old = next;
+
+		p->wake_entry = next;
+		next = cmpxchg(&rq->wake_list, old, p);
+		if (next == old)
+			break;
+	}
+
+	if (!next)
+		smp_send_reschedule(cpu);
+}
+#endif
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
+	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+		ttwu_queue_remote(p, cpu);
+		return;
+	}
+#endif
+
+	raw_spin_lock(&rq->lock);
+	ttwu_do_activate(rq, p, 0);
+	raw_spin_unlock(&rq->lock);
 }
 
 /**
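The rq->wake_list handoff added above is a lock-free producer/consumer list: remote wakers push a task onto the list with cmpxchg() and send a rescheduling IPI only on the empty-to-non-empty transition, while scheduler_ipi() detaches the whole list with xchg() and activates each task under the local rq->lock. Below is a rough userspace analogue using C11 atomics; it is illustrative only, and all names in it are made up.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct wake_task {
	struct wake_task *wake_entry;
	int id;
};

static _Atomic(struct wake_task *) wake_list;

static void queue_remote_demo(struct wake_task *p)	/* producer: any CPU */
{
	struct wake_task *next = atomic_load(&wake_list);

	do {
		p->wake_entry = next;			/* link in front of the current head */
	} while (!atomic_compare_exchange_weak(&wake_list, &next, p));

	if (!next) {
		/* empty -> non-empty: this is where the kernel sends the IPI */
	}
}

static void process_pending_demo(void)			/* consumer: the target CPU */
{
	struct wake_task *list = atomic_exchange(&wake_list, NULL);

	while (list) {
		struct wake_task *p = list;

		list = list->wake_entry;
		printf("waking task %d\n", p->id);	/* stands in for ttwu_do_activate() */
	}
}

int main(void)
{
	struct wake_task a = { .id = 1 }, b = { .id = 2 };

	queue_remote_demo(&a);
	queue_remote_demo(&b);
	process_pending_demo();
	return 0;
}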
@@ -2454,92 +2581,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
-			  int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	unsigned long en_flags = ENQUEUE_WAKEUP;
-	struct rq *rq;
-
-	this_cpu = get_cpu();
+	int cpu, success = 0;
 
 	smp_wmb();
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	if (!(p->state & state))
 		goto out;
 
-	if (p->se.on_rq)
-		goto out_running;
-
+	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
-	orig_cpu = cpu;
 
-#ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
-		goto out_activate;
+	if (p->on_rq && ttwu_remote(p, wake_flags))
+		goto stat;
 
+#ifdef CONFIG_SMP
 	/*
-	 * In order to handle concurrent wakeups and release the rq->lock
-	 * we put the task in TASK_WAKING state.
-	 *
-	 * First fix up the nr_uninterruptible count:
+	 * If the owning (remote) cpu is still in the middle of schedule() with
+	 * this task as prev, wait until its done referencing the task.
 	 */
-	if (task_contributes_to_load(p)) {
-		if (likely(cpu_online(orig_cpu)))
-			rq->nr_uninterruptible--;
-		else
-			this_rq()->nr_uninterruptible--;
-	}
-	p->state = TASK_WAKING;
-
-	if (p->sched_class->task_waking) {
-		p->sched_class->task_waking(rq, p);
-		en_flags |= ENQUEUE_WAKING;
+	while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+		/*
+		 * If called from interrupt context we could have landed in the
+		 * middle of schedule(), in this case we should take care not
+		 * to spin on ->on_cpu if p is current, since that would
+		 * deadlock.
+		 */
+		if (p == current) {
+			ttwu_queue(p, cpu);
+			goto stat;
+		}
+#endif
+		cpu_relax();
 	}
-
-	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
-		set_task_cpu(p, cpu);
-	__task_rq_unlock(rq);
-
-	rq = cpu_rq(cpu);
-	raw_spin_lock(&rq->lock);
-
 	/*
-	 * We migrated the task without holding either rq->lock, however
-	 * since the task is not on the task list itself, nobody else
-	 * will try and migrate the task, hence the rq should match the
-	 * cpu we just moved it to.
+	 * Pairs with the smp_wmb() in finish_lock_switch().
 	 */
-	WARN_ON(task_cpu(p) != cpu);
-	WARN_ON(p->state != TASK_WAKING);
+	smp_rmb();
 
-#ifdef CONFIG_SCHEDSTATS
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu)
-		schedstat_inc(rq, ttwu_local);
-	else {
-		struct sched_domain *sd;
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-	}
-#endif /* CONFIG_SCHEDSTATS */
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
 
-out_activate:
+	if (p->sched_class->task_waking)
+		p->sched_class->task_waking(p);
+
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (task_cpu(p) != cpu)
+		set_task_cpu(p, cpu);
 #endif /* CONFIG_SMP */
-	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-		      cpu == this_cpu, en_flags);
-	success = 1;
-out_running:
-	ttwu_post_activation(p, rq, wake_flags, success);
+
+	ttwu_queue(p, cpu);
+stat:
+	ttwu_stat(p, cpu, wake_flags);
 out:
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return success;
 }
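The while (p->on_cpu) loop above pairs with the smp_wmb() before the on_cpu store in finish_lock_switch(): the waker must not pick a new CPU for the task until the previous CPU has completely finished switching away from it. A loose userspace analogue of that handshake, using C11 release/acquire atomics in place of the kernel's barriers (illustrative only; the names are made up):

#include <stdatomic.h>
#include <sched.h>

struct switch_task { atomic_int on_cpu; };

/* Runs on the CPU switching away from prev: everything it did to prev must
 * be visible before on_cpu reads as 0 (the kernel uses smp_wmb() before a
 * plain store). */
static void finish_switch_demo(struct switch_task *prev)
{
	atomic_store_explicit(&prev->on_cpu, 0, memory_order_release);
}

/* Runs on the waking CPU: only after this loop may it choose a new CPU for
 * the task (the kernel issues smp_rmb() after its loop). */
static void wait_until_descheduled_demo(struct switch_task *p)
{
	while (atomic_load_explicit(&p->on_cpu, memory_order_acquire))
		sched_yield();		/* stands in for cpu_relax() */
}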
@@ -2548,31 +2647,34 @@ out:
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
 */
 static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
-	bool success = false;
 
 	BUG_ON(rq != this_rq());
 	BUG_ON(p == current);
 	lockdep_assert_held(&rq->lock);
 
+	if (!raw_spin_trylock(&p->pi_lock)) {
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&p->pi_lock);
+		raw_spin_lock(&rq->lock);
+	}
+
 	if (!(p->state & TASK_NORMAL))
-		return;
+		goto out;
 
-	if (!p->se.on_rq) {
-		if (likely(!task_running(rq, p))) {
-			schedstat_inc(rq, ttwu_count);
-			schedstat_inc(rq, ttwu_local);
-		}
-		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-		success = true;
-	}
-	ttwu_post_activation(p, rq, 0, success);
+	if (!p->on_rq)
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+	ttwu_do_wakeup(rq, p, 0);
+	ttwu_stat(p, smp_processor_id(), 0);
+out:
+	raw_spin_unlock(&p->pi_lock);
 }
 
 /**
@@ -2605,19 +2707,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
 */
 static void __sched_fork(struct task_struct *p)
 {
+	p->on_rq = 0;
+
+	p->se.on_rq = 0;
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
 	p->se.vruntime = 0;
+	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
-	p->se.on_rq = 0;
-	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2629,6 +2733,7 @@ static void __sched_fork(struct task_struct *p)
 */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
+	unsigned long flags;
 	int cpu = get_cpu();
 
 	__sched_fork(p);
@@ -2679,16 +2784,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	 *
 	 * Silence PROVE_RCU.
 	 */
-	rcu_read_lock();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
-	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-	p->oncpu = 0;
+#if defined(CONFIG_SMP)
+	p->on_cpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
@@ -2712,37 +2817,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
 	unsigned long flags;
 	struct rq *rq;
-	int cpu __maybe_unused = get_cpu();
 
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
-	rq = task_rq_lock(p, &flags);
-	p->state = TASK_WAKING;
-
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 * - cpus_allowed can change in the fork path
 	 * - any previously selected cpu might disappear through hotplug
-	 *
-	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
-	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
-	set_task_cpu(p, cpu);
-
-	p->state = TASK_RUNNING;
-	task_rq_unlock(rq, &flags);
+	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
 #endif
 
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	trace_sched_wakeup_new(p, 1);
+	p->on_rq = 1;
+	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 #endif
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	task_rq_unlock(rq, p, &flags);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3451,27 +3546,22 @@ void sched_exec(void)
 {
 	struct task_struct *p = current;
 	unsigned long flags;
-	struct rq *rq;
 	int dest_cpu;
 
-	rq = task_rq_lock(p, &flags);
-	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	/*
-	 * select_task_rq() can race against ->cpus_allowed
-	 */
-	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+	if (likely(cpu_active(dest_cpu))) {
 		struct migration_arg arg = { p, dest_cpu };
 
-		task_rq_unlock(rq, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 		return;
 	}
 unlock:
-	task_rq_unlock(rq, &flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 #endif
@@ -3508,7 +3598,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3526,7 +3616,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3550,7 +3640,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
 	rq = task_rq_lock(p, &flags);
 	thread_group_cputime(p, &totals);
 	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -4036,7 +4126,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	if (prev->se.on_rq)
+	if (prev->on_rq)
 		update_rq_clock(rq);
 	prev->sched_class->put_prev_task(rq, prev);
 }
@@ -4098,11 +4188,13 @@ need_resched:
 		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
+			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+			prev->on_rq = 0;
+
 			/*
-			 * If a worker is going to sleep, notify and
-			 * ask workqueue whether it wants to wake up a
-			 * task to maintain concurrency. If so, wake
-			 * up the task.
+			 * If a worker went to sleep, notify and ask workqueue
+			 * whether it wants to wake up a task to maintain
+			 * concurrency.
 			 */
 			if (prev->flags & PF_WQ_WORKER) {
 				struct task_struct *to_wakeup;
@@ -4111,21 +4203,20 @@ need_resched:
 				if (to_wakeup)
 					try_to_wake_up_local(to_wakeup);
 			}
-			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+			/*
+			 * If we are going to sleep and we have plugged IO
+			 * queued, make sure to submit it to avoid deadlocks.
+			 */
+			if (blk_needs_flush_plug(prev)) {
+				raw_spin_unlock(&rq->lock);
+				blk_flush_plug(prev);
+				raw_spin_lock(&rq->lock);
+			}
 		}
 		switch_count = &prev->nvcsw;
 	}
 
-	/*
-	 * If we are going to sleep and we have plugged IO queued, make
-	 * sure to submit it to avoid deadlocks.
-	 */
-	if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
-		raw_spin_unlock(&rq->lock);
-		blk_flush_plug(prev);
-		raw_spin_lock(&rq->lock);
-	}
-
 	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
@@ -4162,70 +4253,53 @@ need_resched:
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
-{
-	unsigned int cpu;
-	struct rq *rq;
 
-	if (!sched_feat(OWNER_SPIN))
-		return 0;
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+	bool ret = false;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * Need to access the cpu field knowing that
-	 * DEBUG_PAGEALLOC could have unmapped it if
-	 * the mutex owner just released it and exited.
-	 */
-	if (probe_kernel_address(&owner->cpu, cpu))
-		return 0;
-#else
-	cpu = owner->cpu;
-#endif
+	rcu_read_lock();
+	if (lock->owner != owner)
+		goto fail;
 
 	/*
-	 * Even if the access succeeded (likely case),
-	 * the cpu field may no longer be valid.
+	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
+	 * lock->owner still matches owner, if that fails, owner might
+	 * point to free()d memory, if it still matches, the rcu_read_lock()
+	 * ensures the memory stays valid.
 	 */
-	if (cpu >= nr_cpumask_bits)
-		return 0;
+	barrier();
 
-	/*
-	 * We need to validate that we can do a
-	 * get_cpu() and that we have the percpu area.
-	 */
-	if (!cpu_online(cpu))
-		return 0;
+	ret = owner->on_cpu;
+fail:
+	rcu_read_unlock();
 
-	rq = cpu_rq(cpu);
+	return ret;
+}
 
-	for (;;) {
-		/*
-		 * Owner changed, break to re-assess state.
-		 */
-		if (lock->owner != owner) {
-			/*
-			 * If the lock has switched to a different owner,
-			 * we likely have heavy contention. Return 0 to quit
-			 * optimistic spinning and not contend further:
-			 */
-			if (lock->owner)
-				return 0;
-			break;
-		}
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+	if (!sched_feat(OWNER_SPIN))
+		return 0;
 
-		/*
-		 * Is that owner really running on that cpu?
-		 */
-		if (task_thread_info(rq->curr) != owner || need_resched())
+	while (owner_running(lock, owner)) {
+		if (need_resched())
 			return 0;
 
 		arch_mutex_cpu_relax();
 	}
 
+	/*
+	 * If the owner changed to another task there is likely
+	 * heavy contention, stop spinning.
+	 */
+	if (lock->owner)
+		return 0;
+
 	return 1;
 }
 #endif
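The rewritten spin loop above separates "is the recorded owner still the owner and still on a CPU?" (owner_running(), done under rcu_read_lock() so a freed owner is never dereferenced) from the policy of stopping when the owner changes or the spinner itself needs to reschedule. A simplified userspace sketch of that shape, illustrative only; it omits the RCU protection and every name in it is made up:

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct spin_task  { atomic_int on_cpu; };
struct spin_mutex { _Atomic(struct spin_task *) owner; };

/* Spin only while the recorded owner is still the owner and is running.
 * The kernel additionally wraps the dereference in rcu_read_lock() so a
 * freed owner cannot be touched; this sketch assumes tasks are never freed. */
static bool owner_running_demo(struct spin_mutex *lock, struct spin_task *owner)
{
	if (atomic_load(&lock->owner) != owner)
		return false;
	return atomic_load(&owner->on_cpu) != 0;
}

static int spin_on_owner_demo(struct spin_mutex *lock, struct spin_task *owner,
			      bool (*need_resched)(void))
{
	while (owner_running_demo(lock, owner)) {
		if (need_resched())
			return 0;
		sched_yield();		/* stands in for arch_mutex_cpu_relax() */
	}

	/* Owner changed to another task: likely heavy contention, give up. */
	if (atomic_load(&lock->owner))
		return 0;

	return 1;
}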
@@ -4685,19 +4759,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
 */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	unsigned long flags;
 	int oldprio, on_rq, running;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
 		dequeue_task(rq, p, 0);
@@ -4717,7 +4790,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
 	check_class_changed(rq, p, prev_class, oldprio);
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
 }
 
 #endif
@@ -4745,7 +4818,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 
@@ -4765,7 +4838,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		resched_task(rq->curr);
 	}
 out_unlock:
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -4879,8 +4952,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 static void
 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(p->se.on_rq);
-
 	p->policy = policy;
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
@@ -4995,20 +5066,17 @@ recheck:
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
-	 */
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	/*
+	 *
 	 * To be able to change p->policy safely, the appropriate
 	 * runqueue lock must be held.
 	 */
-	rq = __task_rq_lock(p);
+	rq = task_rq_lock(p, &flags);
 
 	/*
 	 * Changing the policy of the stop threads its a very bad idea
 	 */
 	if (p == rq->stop) {
-		__task_rq_unlock(rq);
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		task_rq_unlock(rq, p, &flags);
 		return -EINVAL;
 	}
 
@@ -5032,8 +5100,7 @@ recheck:
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
 				!task_group_is_autogroup(task_group(p))) {
-			__task_rq_unlock(rq);
-			raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
 	}
@@ -5042,11 +5109,10 @@ recheck:
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		__task_rq_unlock(rq);
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
 		deactivate_task(rq, p, 0);
@@ -5065,8 +5131,7 @@ recheck:
 		activate_task(rq, p, 0);
 
 	check_class_changed(rq, p, prev_class, oldprio);
-	__task_rq_unlock(rq);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	task_rq_unlock(rq, p, &flags);
 
 	rt_mutex_adjust_pi(p);
 
@@ -5317,7 +5382,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
 	unsigned long flags;
-	struct rq *rq;
 	int retval;
 
 	get_online_cpus();
@@ -5332,9 +5396,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 	if (retval)
 		goto out_unlock;
 
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
-	task_rq_unlock(rq, &flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
 	rcu_read_unlock();
@@ -5659,7 +5723,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 
 	rq = task_rq_lock(p, &flags);
 	time_slice = p->sched_class->get_rr_interval(rq, p);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	rcu_read_unlock();
 	jiffies_to_timespec(time_slice, &t);
@@ -5777,8 +5841,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-	idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -5882,18 +5946,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	unsigned int dest_cpu;
 	int ret = 0;
 
-	/*
-	 * Serialize against TASK_WAKING so that ttwu() and wunt() can
-	 * drop the rq->lock and still rely on ->cpus_allowed.
-	 */
-again:
-	while (task_is_waking(p))
-		cpu_relax();
 	rq = task_rq_lock(p, &flags);
-	if (task_is_waking(p)) {
-		task_rq_unlock(rq, &flags);
-		goto again;
-	}
 
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
@@ -5918,16 +5971,16 @@ again:
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (migrate_task(p, rq)) {
+	if (p->on_rq) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, p, &flags);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
 	}
 out:
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ret;
 }
@@ -5955,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
 
+	raw_spin_lock(&p->pi_lock);
 	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
@@ -5967,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
-	if (p->se.on_rq) {
+	if (p->on_rq) {
 		deactivate_task(rq_src, p, 0);
 		set_task_cpu(p, dest_cpu);
 		activate_task(rq_dest, p, 0);
@@ -5977,6 +6031,7 @@ done:
 	ret = 1;
 fail:
 	double_rq_unlock(rq_src, rq_dest);
+	raw_spin_unlock(&p->pi_lock);
 	return ret;
 }
 
@@ -6317,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DYING:
+		sched_ttwu_pending();
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
@@ -7961,7 +8017,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 	int old_prio = p->prio;
 	int on_rq;
 
-	on_rq = p->se.on_rq;
+	on_rq = p->on_rq;
 	if (on_rq)
 		deactivate_task(rq, p, 0);
 	__setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8304,7 +8360,7 @@ void sched_move_task(struct task_struct *tsk)
 	rq = task_rq_lock(tsk, &flags);
 
 	running = task_current(rq, tsk);
-	on_rq = tsk->se.on_rq;
+	on_rq = tsk->on_rq;
 
 	if (on_rq)
 		dequeue_task(rq, tsk, 0);
@@ -8323,7 +8379,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
 
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 