sched: de-SCHED_OTHER-ize the RT path

The current wake-up code path tries to determine if it can optimize the wake-up to "this_cpu" by computing load calculations. The problem is that these calculations are only relevant to SCHED_OTHER tasks where load is king. For RT tasks, priority is king. So the load calculation is completely wasted bandwidth. Therefore, we create a new sched_class interface to help with pre-wakeup routing decisions and move the load calculation as a function of CFS task's class. Signed-off-by: Gregory Haskins <ghaskins@novell.com> Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Gregory Haskins <ghaskins@novell.com> 2008-01-25 15:08:09 -0500
committer: Ingo Molnar <mingo@elte.hu> 2008-01-25 15:08:09 -0500
commit: e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree: 078940540641a59aaf199695bfc6de3f062a987b /kernel
parent: 697f0a487f294e634a342764472b79375bb3158a (diff)
4 files changed, 194 insertions, 140 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 66e99b419b31..3344ba776b97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
        update_load_sub(&rq->load, load);
 }
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_avg_load_per_task(int cpu);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+#endif /* CONFIG_SMP */
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 /*
 * Is this task likely cache-hot:
 */
-static inline int
+static int
 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
        s64 delta;
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type)
 /*
 * Return the average load per task on the cpu's run queue
 */
-static inline unsigned long cpu_avg_load_per_task(int cpu)
+static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag)
 #endif /* CONFIG_SMP */
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, struct task_struct *p)
-{
-        cpumask_t tmp;
-        struct sched_domain *sd;
-        int i;
-        /*
-         * If it is idle, then it is the best cpu to run this task.
-         *
-         * This cpu is also the best, if it has more than one task already.
-         * Siblings must be also busy(in most cases) as they didn't already
-         * pickup the extra load from this cpu and hence we need not check
-         * sibling runqueue info. This will avoid the checks and cache miss
-         * penalities associated with that.
-         */
-        if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
-                return cpu;
-        for_each_domain(cpu, sd) {
-                if (sd->flags & SD_WAKE_IDLE) {
-                        cpus_and(tmp, sd->span, p->cpus_allowed);
-                        for_each_cpu_mask(i, tmp) {
-                                if (idle_cpu(i)) {
-                                        if (i != task_cpu(p)) {
-                                                schedstat_inc(p,
-                                                        se.nr_wakeups_idle);
-                                        }
-                                        return i;
-                                }
-                        }
-                } else {
-                        break;
-                }
-        }
-        return cpu;
-}
-#else
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-        return cpu;
-}
-#endif
 /***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        long old_state;
        struct rq *rq;
 #ifdef CONFIG_SMP
-        struct sched_domain *sd, *this_sd = NULL;
-        unsigned long load, this_load;
        int new_cpu;
 #endif
@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
-        new_cpu = cpu;
+        new_cpu = p->sched_class->select_task_rq(p, sync);
-        schedstat_inc(rq, ttwu_count);
-        if (cpu == this_cpu) {
-                schedstat_inc(rq, ttwu_local);
-                goto out_set_cpu;
-        }
-        for_each_domain(this_cpu, sd) {
-                if (cpu_isset(cpu, sd->span)) {
-                        schedstat_inc(sd, ttwu_wake_remote);
-                        this_sd = sd;
-                        break;
-                }
-        }
-        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-                goto out_set_cpu;
-        /*
-         * Check for affine wakeup and passive balancing possibilities.
-         */
-        if (this_sd) {
-                int idx = this_sd->wake_idx;
-                unsigned int imbalance;
-                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-                load = source_load(cpu, idx);
-                this_load = target_load(this_cpu, idx);
-                new_cpu = this_cpu; /* Wake to this CPU if we can */
-                if (this_sd->flags & SD_WAKE_AFFINE) {
-                        unsigned long tl = this_load;
-                        unsigned long tl_per_task;
-                        /*
-                         * Attract cache-cold tasks on sync wakeups:
-                         */
-                        if (sync && !task_hot(p, rq->clock, this_sd))
-                                goto out_set_cpu;
-                        schedstat_inc(p, se.nr_wakeups_affine_attempts);
-                        tl_per_task = cpu_avg_load_per_task(this_cpu);
-                        /*
-                         * If sync wakeup then subtract the (maximum possible)
-                         * effect of the currently running task from the load
-                         * of the current CPU:
-                         */
-                        if (sync)
-                                tl -= current->se.load.weight;
-                        if ((tl <= load &&
-                                tl + target_load(cpu, idx) <= tl_per_task) ||
-                               100*(tl + p->se.load.weight) <= imbalance*load) {
-                                /*
-                                 * This domain has SD_WAKE_AFFINE and
-                                 * p is cache cold in this domain, and
-                                 * there is no bad imbalance.
-                                 */
-                                schedstat_inc(this_sd, ttwu_move_affine);
-                                schedstat_inc(p, se.nr_wakeups_affine);
-                                goto out_set_cpu;
-                        }
-                }
-                /*
-                 * Start passive balancing when half the imbalance_pct
-                 * limit is reached.
-                 */
-                if (this_sd->flags & SD_WAKE_BALANCE) {
-                        if (imbalance*this_load <= 100*load) {
-                                schedstat_inc(this_sd, ttwu_move_balance);
-                                schedstat_inc(p, se.nr_wakeups_passive);
-                                goto out_set_cpu;
-                        }
-                }
-        }
-        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
-        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
                set_task_cpu(p, new_cpu);
                task_rq_unlock(rq, &flags);
@@ -1693,6 +1563,23 @@ out_set_cpu:
                cpu = task_cpu(p);
        }
+#ifdef CONFIG_SCHEDSTATS
+        schedstat_inc(rq, ttwu_count);
+        if (cpu == this_cpu)
+                schedstat_inc(rq, ttwu_local);
+        else {
+                struct sched_domain *sd;
+                for_each_domain(this_cpu, sd) {
+                        if (cpu_isset(cpu, sd->span)) {
+                                schedstat_inc(sd, ttwu_wake_remote);
+                                break;
+                        }
+                }
+        }
+#endif
 out_activate:
 #endif /* CONFIG_SMP */
        schedstat_inc(p, se.nr_wakeups);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c208e090ae4..f881fc5e035c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,6 +861,151 @@ static void yield_task_fair(struct rq *rq)
 }
 /*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+        cpumask_t tmp;
+        struct sched_domain *sd;
+        int i;
+        /*
+         * If it is idle, then it is the best cpu to run this task.
+         *
+         * This cpu is also the best, if it has more than one task already.
+         * Siblings must be also busy(in most cases) as they didn't already
+         * pickup the extra load from this cpu and hence we need not check
+         * sibling runqueue info. This will avoid the checks and cache miss
+         * penalities associated with that.
+         */
+        if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+                return cpu;
+        for_each_domain(cpu, sd) {
+                if (sd->flags & SD_WAKE_IDLE) {
+                        cpus_and(tmp, sd->span, p->cpus_allowed);
+                        for_each_cpu_mask(i, tmp) {
+                                if (idle_cpu(i)) {
+                                        if (i != task_cpu(p)) {
+                                                schedstat_inc(p,
+                                                       se.nr_wakeups_idle);
+                                        }
+                                        return i;
+                                }
+                        }
+                } else {
+                        break;
+                }
+        }
+        return cpu;
+}
+#else
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+        return cpu;
+}
+#endif
+#ifdef CONFIG_SMP
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+        int cpu, this_cpu;
+        struct rq *rq;
+        struct sched_domain *sd, *this_sd = NULL;
+        int new_cpu;
+        cpu      = task_cpu(p);
+        rq       = task_rq(p);
+        this_cpu = smp_processor_id();
+        new_cpu  = cpu;
+        for_each_domain(this_cpu, sd) {
+                if (cpu_isset(cpu, sd->span)) {
+                        this_sd = sd;
+                        break;
+                }
+        }
+        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+                goto out_set_cpu;
+        /*
+         * Check for affine wakeup and passive balancing possibilities.
+         */
+        if (this_sd) {
+                int idx = this_sd->wake_idx;
+                unsigned int imbalance;
+                unsigned long load, this_load;
+                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+                load = source_load(cpu, idx);
+                this_load = target_load(this_cpu, idx);
+                new_cpu = this_cpu; /* Wake to this CPU if we can */
+                if (this_sd->flags & SD_WAKE_AFFINE) {
+                        unsigned long tl = this_load;
+                        unsigned long tl_per_task;
+                        /*
+                         * Attract cache-cold tasks on sync wakeups:
+                         */
+                        if (sync && !task_hot(p, rq->clock, this_sd))
+                                goto out_set_cpu;
+                        schedstat_inc(p, se.nr_wakeups_affine_attempts);
+                        tl_per_task = cpu_avg_load_per_task(this_cpu);
+                        /*
+                         * If sync wakeup then subtract the (maximum possible)
+                         * effect of the currently running task from the load
+                         * of the current CPU:
+                         */
+                        if (sync)
+                                tl -= current->se.load.weight;
+                        if ((tl <= load &&
+                                tl + target_load(cpu, idx) <= tl_per_task) ||
+                               100*(tl + p->se.load.weight) <= imbalance*load) {
+                                /*
+                                 * This domain has SD_WAKE_AFFINE and
+                                 * p is cache cold in this domain, and
+                                 * there is no bad imbalance.
+                                 */
+                                schedstat_inc(this_sd, ttwu_move_affine);
+                                schedstat_inc(p, se.nr_wakeups_affine);
+                                goto out_set_cpu;
+                        }
+                }
+                /*
+                 * Start passive balancing when half the imbalance_pct
+                 * limit is reached.
+                 */
+                if (this_sd->flags & SD_WAKE_BALANCE) {
+                        if (imbalance*this_load <= 100*load) {
+                                schedstat_inc(this_sd, ttwu_move_balance);
+                                schedstat_inc(p, se.nr_wakeups_passive);
+                                goto out_set_cpu;
+                        }
+                }
+        }
+        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+        return wake_idle(new_cpu, p);
+}
+#endif /* CONFIG_SMP */
+/*
 * Preempt the current task with a newly woken task if needed:
 */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = {
        .enqueue_task           = enqueue_task_fair,
        .dequeue_task           = dequeue_task_fair,
        .yield_task             = yield_task_fair,
+#ifdef CONFIG_SMP
+        .select_task_rq         = select_task_rq_fair,
+#endif /* CONFIG_SMP */
        .check_preempt_curr     = check_preempt_wakeup,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c15b8b..ca5374860aef 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
 *  handled in sched_fair.c)
 */
+#ifdef CONFIG_SMP
+static int select_task_rq_idle(struct task_struct *p, int sync)
+{
+        return task_cpu(p); /* IDLE tasks as never migrated */
+}
+#endif /* CONFIG_SMP */
 /*
 * Idle tasks are unconditionally rescheduled:
 */
@@ -72,6 +78,9 @@ const struct sched_class idle_sched_class = {
        /* dequeue is not valid, we print a debug message there: */
        .dequeue_task           = dequeue_task_idle,
+#ifdef CONFIG_SMP
+        .select_task_rq         = select_task_rq_idle,
+#endif /* CONFIG_SMP */
        .check_preempt_curr     = check_preempt_curr_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b788e35ffd3f..5de1aebdbd1b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -150,6 +150,13 @@ yield_task_rt(struct rq *rq)
        requeue_task_rt(rq, rq->curr);
 }
+#ifdef CONFIG_SMP
+static int select_task_rq_rt(struct task_struct *p, int sync)
+{
+        return task_cpu(p);
+}
+#endif /* CONFIG_SMP */
 /*
 * Preempt the current task with a newly woken task if needed:
 */
@@ -667,6 +674,9 @@ const struct sched_class rt_sched_class = {
        .enqueue_task           = enqueue_task_rt,
        .dequeue_task           = dequeue_task_rt,
        .yield_task             = yield_task_rt,
+#ifdef CONFIG_SMP
+        .select_task_rq         = select_task_rq_rt,
+#endif /* CONFIG_SMP */
        .check_preempt_curr     = check_preempt_curr_rt,
author	Gregory Haskins <ghaskins@novell.com>	2008-01-25 15:08:09 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-01-25 15:08:09 -0500
commit	e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree	078940540641a59aaf199695bfc6de3f062a987b /kernel
parent	697f0a487f294e634a342764472b79375bb3158a (diff)