sched: de-SCHED_OTHER-ize the RT path

The current wake-up code path tries to determine if it can optimize the wake-up to "this_cpu" by computing load calculations. The problem is that these calculations are only relevant to SCHED_OTHER tasks where load is king. For RT tasks, priority is king. So the load calculation is completely wasted bandwidth.

Therefore, we create a new sched_class interface to help with pre-wakeup routing decisions and move the load calculation as a function of CFS task's class.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Gregory Haskins <ghaskins@novell.com> 2008-01-25 15:08:09 -0500
committer: Ingo Molnar <mingo@elte.hu> 2008-01-25 15:08:09 -0500
commit: e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree: 078940540641a59aaf199695bfc6de3f062a987b /kernel/sched.c
parent: 697f0a487f294e634a342764472b79375bb3158a (diff)
1 files changed, 27 insertions, 140 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 66e99b419b31..3344ba776b97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
        update_load_sub(&rq->load, load);
 }
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_avg_load_per_task(int cpu);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+#endif /* CONFIG_SMP */
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 /*
 * Is this task likely cache-hot:
 */
-static inline int
+static int
 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
        s64 delta;
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type)
 /*
 * Return the average load per task on the cpu's run queue
 */
-static inline unsigned long cpu_avg_load_per_task(int cpu)
+static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag)
 #endif /* CONFIG_SMP */
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, struct task_struct *p)
-{
-        cpumask_t tmp;
-        struct sched_domain *sd;
-        int i;
-        /*
-         * If it is idle, then it is the best cpu to run this task.
-         *
-         * This cpu is also the best, if it has more than one task already.
-         * Siblings must be also busy(in most cases) as they didn't already
-         * pickup the extra load from this cpu and hence we need not check
-         * sibling runqueue info. This will avoid the checks and cache miss
-         * penalities associated with that.
-         */
-        if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
-                return cpu;
-        for_each_domain(cpu, sd) {
-                if (sd->flags & SD_WAKE_IDLE) {
-                        cpus_and(tmp, sd->span, p->cpus_allowed);
-                        for_each_cpu_mask(i, tmp) {
-                                if (idle_cpu(i)) {
-                                        if (i != task_cpu(p)) {
-                                                schedstat_inc(p,
-                                                        se.nr_wakeups_idle);
-                                        }
-                                        return i;
-                                }
-                        }
-                } else {
-                        break;
-                }
-        }
-        return cpu;
-}
-#else
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-        return cpu;
-}
-#endif
 /***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        long old_state;
        struct rq *rq;
 #ifdef CONFIG_SMP
-        struct sched_domain *sd, *this_sd = NULL;
-        unsigned long load, this_load;
        int new_cpu;
 #endif
@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
-        new_cpu = cpu;
+        new_cpu = p->sched_class->select_task_rq(p, sync);
-        schedstat_inc(rq, ttwu_count);
-        if (cpu == this_cpu) {
-                schedstat_inc(rq, ttwu_local);
-                goto out_set_cpu;
-        }
-        for_each_domain(this_cpu, sd) {
-                if (cpu_isset(cpu, sd->span)) {
-                        schedstat_inc(sd, ttwu_wake_remote);
-                        this_sd = sd;
-                        break;
-                }
-        }
-        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-                goto out_set_cpu;
-        /*
-         * Check for affine wakeup and passive balancing possibilities.
-         */
-        if (this_sd) {
-                int idx = this_sd->wake_idx;
-                unsigned int imbalance;
-                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-                load = source_load(cpu, idx);
-                this_load = target_load(this_cpu, idx);
-                new_cpu = this_cpu; /* Wake to this CPU if we can */
-                if (this_sd->flags & SD_WAKE_AFFINE) {
-                        unsigned long tl = this_load;
-                        unsigned long tl_per_task;
-                        /*
-                         * Attract cache-cold tasks on sync wakeups:
-                         */
-                        if (sync && !task_hot(p, rq->clock, this_sd))
-                                goto out_set_cpu;
-                        schedstat_inc(p, se.nr_wakeups_affine_attempts);
-                        tl_per_task = cpu_avg_load_per_task(this_cpu);
-                        /*
-                         * If sync wakeup then subtract the (maximum possible)
-                         * effect of the currently running task from the load
-                         * of the current CPU:
-                         */
-                        if (sync)
-                                tl -= current->se.load.weight;
-                        if ((tl <= load &&
-                                tl + target_load(cpu, idx) <= tl_per_task) ||
-                               100*(tl + p->se.load.weight) <= imbalance*load) {
-                                /*
-                                 * This domain has SD_WAKE_AFFINE and
-                                 * p is cache cold in this domain, and
-                                 * there is no bad imbalance.
-                                 */
-                                schedstat_inc(this_sd, ttwu_move_affine);
-                                schedstat_inc(p, se.nr_wakeups_affine);
-                                goto out_set_cpu;
-                        }
-                }
-                /*
-                 * Start passive balancing when half the imbalance_pct
-                 * limit is reached.
-                 */
-                if (this_sd->flags & SD_WAKE_BALANCE) {
-                        if (imbalance*this_load <= 100*load) {
-                                schedstat_inc(this_sd, ttwu_move_balance);
-                                schedstat_inc(p, se.nr_wakeups_passive);
-                                goto out_set_cpu;
-                        }
-                }
-        }
-        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
-        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
                set_task_cpu(p, new_cpu);
                task_rq_unlock(rq, &flags);
@@ -1693,6 +1563,23 @@ out_set_cpu:
                cpu = task_cpu(p);
        }
+#ifdef CONFIG_SCHEDSTATS
+        schedstat_inc(rq, ttwu_count);
+        if (cpu == this_cpu)
+                schedstat_inc(rq, ttwu_local);
+        else {
+                struct sched_domain *sd;
+                for_each_domain(this_cpu, sd) {
+                        if (cpu_isset(cpu, sd->span)) {
+                                schedstat_inc(sd, ttwu_wake_remote);
+                                break;
+                        }
+                }
+        }
+#endif
 out_activate:
 #endif /* CONFIG_SMP */
        schedstat_inc(p, se.nr_wakeups);
author	Gregory Haskins <ghaskins@novell.com>	2008-01-25 15:08:09 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-01-25 15:08:09 -0500
commit	e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree	078940540641a59aaf199695bfc6de3f062a987b /kernel/sched.c
parent	697f0a487f294e634a342764472b79375bb3158a (diff)

diff --git a/kernel/sched.c b/kernel/sched.c
index 66e99b419b31..3344ba776b97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
960	update_load_sub(&rq->load, load);	960	update_load_sub(&rq->load, load);
961	}	961	}
962		962
		963	#ifdef CONFIG_SMP
		964	static unsigned long source_load(int cpu, int type);
		965	static unsigned long target_load(int cpu, int type);
		966	static unsigned long cpu_avg_load_per_task(int cpu);
		967	static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
		968	#endif /* CONFIG_SMP */
		969
963	#include "sched_stats.h"	970	#include "sched_stats.h"
964	#include "sched_idletask.c"	971	#include "sched_idletask.c"
965	#include "sched_fair.c"	972	#include "sched_fair.c"
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1118	/*	1125	/*
1119	* Is this task likely cache-hot:	1126	* Is this task likely cache-hot:
1120	*/	1127	*/
1121	static inline int	1128	static int
1122	task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)	1129	task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1123	{	1130	{
1124	s64 delta;	1131	s64 delta;
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type)
1343	/*	1350	/*
1344	* Return the average load per task on the cpu's run queue	1351	* Return the average load per task on the cpu's run queue
1345	*/	1352	*/
1346	static inline unsigned long cpu_avg_load_per_task(int cpu)	1353	static unsigned long cpu_avg_load_per_task(int cpu)
1347	{	1354	{
1348	struct rq *rq = cpu_rq(cpu);	1355	struct rq *rq = cpu_rq(cpu);
1349	unsigned long total = weighted_cpuload(cpu);	1356	unsigned long total = weighted_cpuload(cpu);
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag)
1500		1507
1501	#endif /* CONFIG_SMP */	1508	#endif /* CONFIG_SMP */
1502		1509
1503	/*
1504	* wake_idle() will wake a task on an idle cpu if task->cpu is
1505	* not idle and an idle cpu is available. The span of cpus to
1506	* search starts with cpus closest then further out as needed,
1507	* so we always favor a closer, idle cpu.
1508	*
1509	* Returns the CPU we should wake onto.
1510	*/
1511	#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1512	static int wake_idle(int cpu, struct task_struct *p)
1513	{
1514	cpumask_t tmp;
1515	struct sched_domain *sd;
1516	int i;
1517
1518	/*
1519	* If it is idle, then it is the best cpu to run this task.
1520	*
1521	* This cpu is also the best, if it has more than one task already.
1522	* Siblings must be also busy(in most cases) as they didn't already
1523	* pickup the extra load from this cpu and hence we need not check
1524	* sibling runqueue info. This will avoid the checks and cache miss
1525	* penalities associated with that.
1526	*/
1527	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1528	return cpu;
1529
1530	for_each_domain(cpu, sd) {
1531	if (sd->flags & SD_WAKE_IDLE) {
1532	cpus_and(tmp, sd->span, p->cpus_allowed);
1533	for_each_cpu_mask(i, tmp) {
1534	if (idle_cpu(i)) {
1535	if (i != task_cpu(p)) {
1536	schedstat_inc(p,
1537	se.nr_wakeups_idle);
1538	}
1539	return i;
1540	}
1541	}
1542	} else {
1543	break;
1544	}
1545	}
1546	return cpu;
1547	}
1548	#else
1549	static inline int wake_idle(int cpu, struct task_struct *p)
1550	{
1551	return cpu;
1552	}
1553	#endif
1554
1555	/***	1510	/***
1556	* try_to_wake_up - wake up a thread	1511	* try_to_wake_up - wake up a thread
1557	* @p: the to-be-woken-up thread	1512	* @p: the to-be-woken-up thread
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1573	long old_state;	1528	long old_state;
1574	struct rq *rq;	1529	struct rq *rq;
1575	#ifdef CONFIG_SMP	1530	#ifdef CONFIG_SMP
1576	struct sched_domain *sd, *this_sd = NULL;
1577	unsigned long load, this_load;
1578	int new_cpu;	1531	int new_cpu;
1579	#endif	1532	#endif
1580		1533
@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1594	if (unlikely(task_running(rq, p)))	1547	if (unlikely(task_running(rq, p)))
1595	goto out_activate;	1548	goto out_activate;
1596		1549
1597	new_cpu = cpu;	1550	new_cpu = p->sched_class->select_task_rq(p, sync);
1598
1599	schedstat_inc(rq, ttwu_count);
1600	if (cpu == this_cpu) {
1601	schedstat_inc(rq, ttwu_local);
1602	goto out_set_cpu;
1603	}
1604
1605	for_each_domain(this_cpu, sd) {
1606	if (cpu_isset(cpu, sd->span)) {
1607	schedstat_inc(sd, ttwu_wake_remote);
1608	this_sd = sd;
1609	break;
1610	}
1611	}
1612
1613	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1614	goto out_set_cpu;
1615
1616	/*
1617	* Check for affine wakeup and passive balancing possibilities.
1618	*/
1619	if (this_sd) {
1620	int idx = this_sd->wake_idx;
1621	unsigned int imbalance;
1622
1623	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1624
1625	load = source_load(cpu, idx);
1626	this_load = target_load(this_cpu, idx);
1627
1628	new_cpu = this_cpu; /* Wake to this CPU if we can */
1629
1630	if (this_sd->flags & SD_WAKE_AFFINE) {
1631	unsigned long tl = this_load;
1632	unsigned long tl_per_task;
1633
1634	/*
1635	* Attract cache-cold tasks on sync wakeups:
1636	*/
1637	if (sync && !task_hot(p, rq->clock, this_sd))
1638	goto out_set_cpu;
1639
1640	schedstat_inc(p, se.nr_wakeups_affine_attempts);
1641	tl_per_task = cpu_avg_load_per_task(this_cpu);
1642
1643	/*
1644	* If sync wakeup then subtract the (maximum possible)
1645	* effect of the currently running task from the load
1646	* of the current CPU:
1647	*/
1648	if (sync)
1649	tl -= current->se.load.weight;
1650
1651	if ((tl <= load &&
1652	tl + target_load(cpu, idx) <= tl_per_task) ||
1653	100*(tl + p->se.load.weight) <= imbalance*load) {
1654	/*
1655	* This domain has SD_WAKE_AFFINE and
1656	* p is cache cold in this domain, and
1657	* there is no bad imbalance.
1658	*/
1659	schedstat_inc(this_sd, ttwu_move_affine);
1660	schedstat_inc(p, se.nr_wakeups_affine);
1661	goto out_set_cpu;
1662	}
1663	}
1664
1665	/*
1666	* Start passive balancing when half the imbalance_pct
1667	* limit is reached.
1668	*/
1669	if (this_sd->flags & SD_WAKE_BALANCE) {
1670	if (imbalance*this_load <= 100*load) {
1671	schedstat_inc(this_sd, ttwu_move_balance);
1672	schedstat_inc(p, se.nr_wakeups_passive);
1673	goto out_set_cpu;
1674	}
1675	}
1676	}
1677
1678	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1679	out_set_cpu:
1680	new_cpu = wake_idle(new_cpu, p);
1681	if (new_cpu != cpu) {	1551	if (new_cpu != cpu) {
1682	set_task_cpu(p, new_cpu);	1552	set_task_cpu(p, new_cpu);
1683	task_rq_unlock(rq, &flags);	1553	task_rq_unlock(rq, &flags);
@@ -1693,6 +1563,23 @@ out_set_cpu:
1693	cpu = task_cpu(p);	1563	cpu = task_cpu(p);
1694	}	1564	}
1695		1565
		1566	#ifdef CONFIG_SCHEDSTATS
		1567	schedstat_inc(rq, ttwu_count);
		1568	if (cpu == this_cpu)
		1569	schedstat_inc(rq, ttwu_local);
		1570	else {
		1571	struct sched_domain *sd;
		1572	for_each_domain(this_cpu, sd) {
		1573	if (cpu_isset(cpu, sd->span)) {
		1574	schedstat_inc(sd, ttwu_wake_remote);
		1575	break;
		1576	}
		1577	}
		1578	}
		1579
		1580	#endif
		1581
		1582
1696	out_activate:	1583	out_activate:
1697	#endif /* CONFIG_SMP */	1584	#endif /* CONFIG_SMP */
1698	schedstat_inc(p, se.nr_wakeups);	1585	schedstat_inc(p, se.nr_wakeups);