sched: de-SCHED_OTHER-ize the RT path

The current wake-up code path tries to determine if it can optimize the wake-up to "this_cpu" by computing load calculations. The problem is that these calculations are only relevant to SCHED_OTHER tasks where load is king. For RT tasks, priority is king. So the load calculation is completely wasted bandwidth. Therefore, we create a new sched_class interface to help with pre-wakeup routing decisions and move the load calculation as a function of CFS task's class.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Gregory Haskins <ghaskins@novell.com> 2008-01-25 15:08:09 -0500
committer: Ingo Molnar <mingo@elte.hu> 2008-01-25 15:08:09 -0500
commit: e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree: 078940540641a59aaf199695bfc6de3f062a987b /kernel/sched_fair.c
parent: 697f0a487f294e634a342764472b79375bb3158a (diff)
1 files changed, 148 insertions, 0 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c208e090ae4..f881fc5e035c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,6 +861,151 @@ static void yield_task_fair(struct rq *rq)
 }
 /*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+        cpumask_t tmp;
+        struct sched_domain *sd;
+        int i;
+        /*
+         * If it is idle, then it is the best cpu to run this task.
+         *
+         * This cpu is also the best, if it has more than one task already.
+         * Siblings must be also busy(in most cases) as they didn't already
+         * pickup the extra load from this cpu and hence we need not check
+         * sibling runqueue info. This will avoid the checks and cache miss
+         * penalities associated with that.
+         */
+        if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+                return cpu;
+        for_each_domain(cpu, sd) {
+                if (sd->flags & SD_WAKE_IDLE) {
+                        cpus_and(tmp, sd->span, p->cpus_allowed);
+                        for_each_cpu_mask(i, tmp) {
+                                if (idle_cpu(i)) {
+                                        if (i != task_cpu(p)) {
+                                                schedstat_inc(p,
+                                                       se.nr_wakeups_idle);
+                                        }
+                                        return i;
+                                }
+                        }
+                } else {
+                        break;
+                }
+        }
+        return cpu;
+}
+#else
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+        return cpu;
+}
+#endif
+#ifdef CONFIG_SMP
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+        int cpu, this_cpu;
+        struct rq *rq;
+        struct sched_domain *sd, *this_sd = NULL;
+        int new_cpu;
+        cpu      = task_cpu(p);
+        rq       = task_rq(p);
+        this_cpu = smp_processor_id();
+        new_cpu  = cpu;
+        for_each_domain(this_cpu, sd) {
+                if (cpu_isset(cpu, sd->span)) {
+                        this_sd = sd;
+                        break;
+                }
+        }
+        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+                goto out_set_cpu;
+        /*
+         * Check for affine wakeup and passive balancing possibilities.
+         */
+        if (this_sd) {
+                int idx = this_sd->wake_idx;
+                unsigned int imbalance;
+                unsigned long load, this_load;
+                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+                load = source_load(cpu, idx);
+                this_load = target_load(this_cpu, idx);
+                new_cpu = this_cpu; /* Wake to this CPU if we can */
+                if (this_sd->flags & SD_WAKE_AFFINE) {
+                        unsigned long tl = this_load;
+                        unsigned long tl_per_task;
+                        /*
+                         * Attract cache-cold tasks on sync wakeups:
+                         */
+                        if (sync && !task_hot(p, rq->clock, this_sd))
+                                goto out_set_cpu;
+                        schedstat_inc(p, se.nr_wakeups_affine_attempts);
+                        tl_per_task = cpu_avg_load_per_task(this_cpu);
+                        /*
+                         * If sync wakeup then subtract the (maximum possible)
+                         * effect of the currently running task from the load
+                         * of the current CPU:
+                         */
+                        if (sync)
+                                tl -= current->se.load.weight;
+                        if ((tl <= load &&
+                                tl + target_load(cpu, idx) <= tl_per_task) ||
+                               100*(tl + p->se.load.weight) <= imbalance*load) {
+                                /*
+                                 * This domain has SD_WAKE_AFFINE and
+                                 * p is cache cold in this domain, and
+                                 * there is no bad imbalance.
+                                 */
+                                schedstat_inc(this_sd, ttwu_move_affine);
+                                schedstat_inc(p, se.nr_wakeups_affine);
+                                goto out_set_cpu;
+                        }
+                }
+                /*
+                 * Start passive balancing when half the imbalance_pct
+                 * limit is reached.
+                 */
+                if (this_sd->flags & SD_WAKE_BALANCE) {
+                        if (imbalance*this_load <= 100*load) {
+                                schedstat_inc(this_sd, ttwu_move_balance);
+                                schedstat_inc(p, se.nr_wakeups_passive);
+                                goto out_set_cpu;
+                        }
+                }
+        }
+        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+        return wake_idle(new_cpu, p);
+}
+#endif /* CONFIG_SMP */
+/*
 * Preempt the current task with a newly woken task if needed:
 */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = {
        .enqueue_task           = enqueue_task_fair,
        .dequeue_task           = dequeue_task_fair,
        .yield_task             = yield_task_fair,
+#ifdef CONFIG_SMP
+        .select_task_rq         = select_task_rq_fair,
+#endif /* CONFIG_SMP */
        .check_preempt_curr     = check_preempt_wakeup,
author	Gregory Haskins <ghaskins@novell.com>	2008-01-25 15:08:09 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-01-25 15:08:09 -0500
commit	e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree	078940540641a59aaf199695bfc6de3f062a987b /kernel/sched_fair.c
parent	697f0a487f294e634a342764472b79375bb3158a (diff)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c208e090ae4..f881fc5e035c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,6 +861,151 @@ static void yield_task_fair(struct rq *rq)
861	}	861	}
862		862
863	/*	863	/*
		864	* wake_idle() will wake a task on an idle cpu if task->cpu is
		865	* not idle and an idle cpu is available. The span of cpus to
		866	* search starts with cpus closest then further out as needed,
		867	* so we always favor a closer, idle cpu.
		868	*
		869	* Returns the CPU we should wake onto.
		870	*/
		871	#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
		872	static int wake_idle(int cpu, struct task_struct *p)
		873	{
		874	cpumask_t tmp;
		875	struct sched_domain *sd;
		876	int i;
		877
		878	/*
		879	* If it is idle, then it is the best cpu to run this task.
		880	*
		881	* This cpu is also the best, if it has more than one task already.
		882	* Siblings must be also busy(in most cases) as they didn't already
		883	* pickup the extra load from this cpu and hence we need not check
		884	* sibling runqueue info. This will avoid the checks and cache miss
		885	* penalities associated with that.
		886	*/
		887	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		888	return cpu;
		889
		890	for_each_domain(cpu, sd) {
		891	if (sd->flags & SD_WAKE_IDLE) {
		892	cpus_and(tmp, sd->span, p->cpus_allowed);
		893	for_each_cpu_mask(i, tmp) {
		894	if (idle_cpu(i)) {
		895	if (i != task_cpu(p)) {
		896	schedstat_inc(p,
		897	se.nr_wakeups_idle);
		898	}
		899	return i;
		900	}
		901	}
		902	} else {
		903	break;
		904	}
		905	}
		906	return cpu;
		907	}
		908	#else
		909	static inline int wake_idle(int cpu, struct task_struct *p)
		910	{
		911	return cpu;
		912	}
		913	#endif
		914
		915	#ifdef CONFIG_SMP
		916	static int select_task_rq_fair(struct task_struct *p, int sync)
		917	{
		918	int cpu, this_cpu;
		919	struct rq *rq;
		920	struct sched_domain *sd, *this_sd = NULL;
		921	int new_cpu;
		922
		923	cpu = task_cpu(p);
		924	rq = task_rq(p);
		925	this_cpu = smp_processor_id();
		926	new_cpu = cpu;
		927
		928	for_each_domain(this_cpu, sd) {
		929	if (cpu_isset(cpu, sd->span)) {
		930	this_sd = sd;
		931	break;
		932	}
		933	}
		934
		935	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		936	goto out_set_cpu;
		937
		938	/*
		939	* Check for affine wakeup and passive balancing possibilities.
		940	*/
		941	if (this_sd) {
		942	int idx = this_sd->wake_idx;
		943	unsigned int imbalance;
		944	unsigned long load, this_load;
		945
		946	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
		947
		948	load = source_load(cpu, idx);
		949	this_load = target_load(this_cpu, idx);
		950
		951	new_cpu = this_cpu; /* Wake to this CPU if we can */
		952
		953	if (this_sd->flags & SD_WAKE_AFFINE) {
		954	unsigned long tl = this_load;
		955	unsigned long tl_per_task;
		956
		957	/*
		958	* Attract cache-cold tasks on sync wakeups:
		959	*/
		960	if (sync && !task_hot(p, rq->clock, this_sd))
		961	goto out_set_cpu;
		962
		963	schedstat_inc(p, se.nr_wakeups_affine_attempts);
		964	tl_per_task = cpu_avg_load_per_task(this_cpu);
		965
		966	/*
		967	* If sync wakeup then subtract the (maximum possible)
		968	* effect of the currently running task from the load
		969	* of the current CPU:
		970	*/
		971	if (sync)
		972	tl -= current->se.load.weight;
		973
		974	if ((tl <= load &&
		975	tl + target_load(cpu, idx) <= tl_per_task) ||
		976	100*(tl + p->se.load.weight) <= imbalance*load) {
		977	/*
		978	* This domain has SD_WAKE_AFFINE and
		979	* p is cache cold in this domain, and
		980	* there is no bad imbalance.
		981	*/
		982	schedstat_inc(this_sd, ttwu_move_affine);
		983	schedstat_inc(p, se.nr_wakeups_affine);
		984	goto out_set_cpu;
		985	}
		986	}
		987
		988	/*
		989	* Start passive balancing when half the imbalance_pct
		990	* limit is reached.
		991	*/
		992	if (this_sd->flags & SD_WAKE_BALANCE) {
		993	if (imbalance*this_load <= 100*load) {
		994	schedstat_inc(this_sd, ttwu_move_balance);
		995	schedstat_inc(p, se.nr_wakeups_passive);
		996	goto out_set_cpu;
		997	}
		998	}
		999	}
		1000
		1001	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
		1002	out_set_cpu:
		1003	return wake_idle(new_cpu, p);
		1004	}
		1005	#endif /* CONFIG_SMP */
		1006
		1007
		1008	/*
864	* Preempt the current task with a newly woken task if needed:	1009	* Preempt the current task with a newly woken task if needed:
865	*/	1010	*/
866	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)	1011	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = {
1153	.enqueue_task = enqueue_task_fair,	1298	.enqueue_task = enqueue_task_fair,
1154	.dequeue_task = dequeue_task_fair,	1299	.dequeue_task = dequeue_task_fair,
1155	.yield_task = yield_task_fair,	1300	.yield_task = yield_task_fair,
		1301	#ifdef CONFIG_SMP
		1302	.select_task_rq = select_task_rq_fair,
		1303	#endif /* CONFIG_SMP */
1156		1304
1157	.check_preempt_curr = check_preempt_wakeup,	1305	.check_preempt_curr = check_preempt_wakeup,
1158		1306