aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorGregory Haskins <ghaskins@novell.com>2008-01-25 15:08:09 -0500
committerIngo Molnar <mingo@elte.hu>2008-01-25 15:08:09 -0500
commite7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch)
tree078940540641a59aaf199695bfc6de3f062a987b /kernel
parent697f0a487f294e634a342764472b79375bb3158a (diff)
sched: de-SCHED_OTHER-ize the RT path
The current wake-up code path tries to determine whether it can optimize the wake-up to "this_cpu" by performing load calculations. The problem is that these calculations are only relevant to SCHED_OTHER tasks, where load is king. For RT tasks, priority is king, so the load calculation is completely wasted bandwidth. Therefore, we create a new sched_class interface (select_task_rq) to help with pre-wakeup routing decisions, and move the load calculation into the CFS task's scheduling class. Signed-off-by: Gregory Haskins <ghaskins@novell.com> Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched.c167
-rw-r--r--kernel/sched_fair.c148
-rw-r--r--kernel/sched_idletask.c9
-rw-r--r--kernel/sched_rt.c10
4 files changed, 194 insertions, 140 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 66e99b419b31..3344ba776b97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
960 update_load_sub(&rq->load, load); 960 update_load_sub(&rq->load, load);
961} 961}
962 962
963#ifdef CONFIG_SMP
964static unsigned long source_load(int cpu, int type);
965static unsigned long target_load(int cpu, int type);
966static unsigned long cpu_avg_load_per_task(int cpu);
967static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
968#endif /* CONFIG_SMP */
969
963#include "sched_stats.h" 970#include "sched_stats.h"
964#include "sched_idletask.c" 971#include "sched_idletask.c"
965#include "sched_fair.c" 972#include "sched_fair.c"
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1118/* 1125/*
1119 * Is this task likely cache-hot: 1126 * Is this task likely cache-hot:
1120 */ 1127 */
1121static inline int 1128static int
1122task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1129task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1123{ 1130{
1124 s64 delta; 1131 s64 delta;
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type)
1343/* 1350/*
1344 * Return the average load per task on the cpu's run queue 1351 * Return the average load per task on the cpu's run queue
1345 */ 1352 */
1346static inline unsigned long cpu_avg_load_per_task(int cpu) 1353static unsigned long cpu_avg_load_per_task(int cpu)
1347{ 1354{
1348 struct rq *rq = cpu_rq(cpu); 1355 struct rq *rq = cpu_rq(cpu);
1349 unsigned long total = weighted_cpuload(cpu); 1356 unsigned long total = weighted_cpuload(cpu);
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag)
1500 1507
1501#endif /* CONFIG_SMP */ 1508#endif /* CONFIG_SMP */
1502 1509
1503/*
1504 * wake_idle() will wake a task on an idle cpu if task->cpu is
1505 * not idle and an idle cpu is available. The span of cpus to
1506 * search starts with cpus closest then further out as needed,
1507 * so we always favor a closer, idle cpu.
1508 *
1509 * Returns the CPU we should wake onto.
1510 */
1511#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1512static int wake_idle(int cpu, struct task_struct *p)
1513{
1514 cpumask_t tmp;
1515 struct sched_domain *sd;
1516 int i;
1517
1518 /*
1519 * If it is idle, then it is the best cpu to run this task.
1520 *
1521 * This cpu is also the best, if it has more than one task already.
1522 * Siblings must be also busy(in most cases) as they didn't already
1523 * pickup the extra load from this cpu and hence we need not check
1524 * sibling runqueue info. This will avoid the checks and cache miss
1525 * penalties associated with that.
1526 */
1527 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1528 return cpu;
1529
1530 for_each_domain(cpu, sd) {
1531 if (sd->flags & SD_WAKE_IDLE) {
1532 cpus_and(tmp, sd->span, p->cpus_allowed);
1533 for_each_cpu_mask(i, tmp) {
1534 if (idle_cpu(i)) {
1535 if (i != task_cpu(p)) {
1536 schedstat_inc(p,
1537 se.nr_wakeups_idle);
1538 }
1539 return i;
1540 }
1541 }
1542 } else {
1543 break;
1544 }
1545 }
1546 return cpu;
1547}
1548#else
1549static inline int wake_idle(int cpu, struct task_struct *p)
1550{
1551 return cpu;
1552}
1553#endif
1554
1555/*** 1510/***
1556 * try_to_wake_up - wake up a thread 1511 * try_to_wake_up - wake up a thread
1557 * @p: the to-be-woken-up thread 1512 * @p: the to-be-woken-up thread
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1573 long old_state; 1528 long old_state;
1574 struct rq *rq; 1529 struct rq *rq;
1575#ifdef CONFIG_SMP 1530#ifdef CONFIG_SMP
1576 struct sched_domain *sd, *this_sd = NULL;
1577 unsigned long load, this_load;
1578 int new_cpu; 1531 int new_cpu;
1579#endif 1532#endif
1580 1533
@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1594 if (unlikely(task_running(rq, p))) 1547 if (unlikely(task_running(rq, p)))
1595 goto out_activate; 1548 goto out_activate;
1596 1549
1597 new_cpu = cpu; 1550 new_cpu = p->sched_class->select_task_rq(p, sync);
1598
1599 schedstat_inc(rq, ttwu_count);
1600 if (cpu == this_cpu) {
1601 schedstat_inc(rq, ttwu_local);
1602 goto out_set_cpu;
1603 }
1604
1605 for_each_domain(this_cpu, sd) {
1606 if (cpu_isset(cpu, sd->span)) {
1607 schedstat_inc(sd, ttwu_wake_remote);
1608 this_sd = sd;
1609 break;
1610 }
1611 }
1612
1613 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1614 goto out_set_cpu;
1615
1616 /*
1617 * Check for affine wakeup and passive balancing possibilities.
1618 */
1619 if (this_sd) {
1620 int idx = this_sd->wake_idx;
1621 unsigned int imbalance;
1622
1623 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1624
1625 load = source_load(cpu, idx);
1626 this_load = target_load(this_cpu, idx);
1627
1628 new_cpu = this_cpu; /* Wake to this CPU if we can */
1629
1630 if (this_sd->flags & SD_WAKE_AFFINE) {
1631 unsigned long tl = this_load;
1632 unsigned long tl_per_task;
1633
1634 /*
1635 * Attract cache-cold tasks on sync wakeups:
1636 */
1637 if (sync && !task_hot(p, rq->clock, this_sd))
1638 goto out_set_cpu;
1639
1640 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1641 tl_per_task = cpu_avg_load_per_task(this_cpu);
1642
1643 /*
1644 * If sync wakeup then subtract the (maximum possible)
1645 * effect of the currently running task from the load
1646 * of the current CPU:
1647 */
1648 if (sync)
1649 tl -= current->se.load.weight;
1650
1651 if ((tl <= load &&
1652 tl + target_load(cpu, idx) <= tl_per_task) ||
1653 100*(tl + p->se.load.weight) <= imbalance*load) {
1654 /*
1655 * This domain has SD_WAKE_AFFINE and
1656 * p is cache cold in this domain, and
1657 * there is no bad imbalance.
1658 */
1659 schedstat_inc(this_sd, ttwu_move_affine);
1660 schedstat_inc(p, se.nr_wakeups_affine);
1661 goto out_set_cpu;
1662 }
1663 }
1664
1665 /*
1666 * Start passive balancing when half the imbalance_pct
1667 * limit is reached.
1668 */
1669 if (this_sd->flags & SD_WAKE_BALANCE) {
1670 if (imbalance*this_load <= 100*load) {
1671 schedstat_inc(this_sd, ttwu_move_balance);
1672 schedstat_inc(p, se.nr_wakeups_passive);
1673 goto out_set_cpu;
1674 }
1675 }
1676 }
1677
1678 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1679out_set_cpu:
1680 new_cpu = wake_idle(new_cpu, p);
1681 if (new_cpu != cpu) { 1551 if (new_cpu != cpu) {
1682 set_task_cpu(p, new_cpu); 1552 set_task_cpu(p, new_cpu);
1683 task_rq_unlock(rq, &flags); 1553 task_rq_unlock(rq, &flags);
@@ -1693,6 +1563,23 @@ out_set_cpu:
1693 cpu = task_cpu(p); 1563 cpu = task_cpu(p);
1694 } 1564 }
1695 1565
1566#ifdef CONFIG_SCHEDSTATS
1567 schedstat_inc(rq, ttwu_count);
1568 if (cpu == this_cpu)
1569 schedstat_inc(rq, ttwu_local);
1570 else {
1571 struct sched_domain *sd;
1572 for_each_domain(this_cpu, sd) {
1573 if (cpu_isset(cpu, sd->span)) {
1574 schedstat_inc(sd, ttwu_wake_remote);
1575 break;
1576 }
1577 }
1578 }
1579
1580#endif
1581
1582
1696out_activate: 1583out_activate:
1697#endif /* CONFIG_SMP */ 1584#endif /* CONFIG_SMP */
1698 schedstat_inc(p, se.nr_wakeups); 1585 schedstat_inc(p, se.nr_wakeups);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c208e090ae4..f881fc5e035c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,6 +861,151 @@ static void yield_task_fair(struct rq *rq)
861} 861}
862 862
863/* 863/*
864 * wake_idle() will wake a task on an idle cpu if task->cpu is
865 * not idle and an idle cpu is available. The span of cpus to
866 * search starts with cpus closest then further out as needed,
867 * so we always favor a closer, idle cpu.
868 *
869 * Returns the CPU we should wake onto.
870 */
871#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
872static int wake_idle(int cpu, struct task_struct *p)
873{
874 cpumask_t tmp;
875 struct sched_domain *sd;
876 int i;
877
878 /*
879 * If it is idle, then it is the best cpu to run this task.
880 *
881 * This cpu is also the best, if it has more than one task already.
882 * Siblings must be also busy (in most cases) as they didn't already
883 * pick up the extra load from this cpu and hence we need not check
884 * sibling runqueue info. This will avoid the checks and cache miss
885 * penalties associated with that.
886 */
887 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
888 return cpu;
889
890 for_each_domain(cpu, sd) {
891 if (sd->flags & SD_WAKE_IDLE) {
892 cpus_and(tmp, sd->span, p->cpus_allowed);
893 for_each_cpu_mask(i, tmp) {
894 if (idle_cpu(i)) {
895 if (i != task_cpu(p)) {
896 schedstat_inc(p,
897 se.nr_wakeups_idle);
898 }
899 return i;
900 }
901 }
902 } else {
903 break; /* stop at the first domain without SD_WAKE_IDLE */
904 }
905 }
906 return cpu;
907}
908#else
909static inline int wake_idle(int cpu, struct task_struct *p)
910{
911 return cpu; /* no idle-cpu search without ARCH_HAS_SCHED_WAKE_IDLE */
912}
913#endif
914
915#ifdef CONFIG_SMP
/*
 * select_task_rq_fair - pick the cpu a waking task of this class should
 * be placed on.
 *
 * @p:    the task being woken
 * @sync: non-zero for a sync wakeup; used below to attract cache-cold
 *        tasks and to discount current's weight from this_cpu's load
 *
 * Starts from the task's current cpu (task_cpu(p)) and checks whether the
 * task can instead be woken on the cpu executing this code (this_cpu),
 * using the affine-wakeup and passive-balancing load comparisons below.
 * The resulting candidate is finally passed through wake_idle().
 *
 * Returns the cpu number chosen for the wakeup.
 */
916static int select_task_rq_fair(struct task_struct *p, int sync)
917{
918 int cpu, this_cpu;
919 struct rq *rq;
920 struct sched_domain *sd, *this_sd = NULL;
921 int new_cpu;
922
923 cpu = task_cpu(p);
924 rq = task_rq(p);
925 this_cpu = smp_processor_id();
926 new_cpu = cpu;
927
928 for_each_domain(this_cpu, sd) { /* first domain of this_cpu that also spans cpu */
929 if (cpu_isset(cpu, sd->span)) {
930 this_sd = sd;
931 break;
932 }
933 }
934
935 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) /* p may not run here: keep new_cpu == cpu */
936 goto out_set_cpu;
937
938 /*
939 * Check for affine wakeup and passive balancing possibilities.
940 */
941 if (this_sd) {
942 int idx = this_sd->wake_idx;
943 unsigned int imbalance;
944 unsigned long load, this_load;
945
946 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
947
948 load = source_load(cpu, idx);
949 this_load = target_load(this_cpu, idx);
950
951 new_cpu = this_cpu; /* Wake to this CPU if we can */
952
953 if (this_sd->flags & SD_WAKE_AFFINE) {
954 unsigned long tl = this_load;
955 unsigned long tl_per_task;
956
957 /*
958 * Attract cache-cold tasks on sync wakeups:
959 */
960 if (sync && !task_hot(p, rq->clock, this_sd))
961 goto out_set_cpu;
962
963 schedstat_inc(p, se.nr_wakeups_affine_attempts);
964 tl_per_task = cpu_avg_load_per_task(this_cpu);
965
966 /*
967 * If sync wakeup then subtract the (maximum possible)
968 * effect of the currently running task from the load
969 * of the current CPU:
970 */
971 if (sync)
972 tl -= current->se.load.weight;
973
974 if ((tl <= load &&
975 tl + target_load(cpu, idx) <= tl_per_task) ||
976 100*(tl + p->se.load.weight) <= imbalance*load) {
977 /*
978 * This domain has SD_WAKE_AFFINE and
979 * p is cache cold in this domain, and
980 * there is no bad imbalance.
981 */
982 schedstat_inc(this_sd, ttwu_move_affine);
983 schedstat_inc(p, se.nr_wakeups_affine);
984 goto out_set_cpu;
985 }
986 }
987
988 /*
989 * Start passive balancing when half the imbalance_pct
990 * limit is reached.
991 */
992 if (this_sd->flags & SD_WAKE_BALANCE) {
993 if (imbalance*this_load <= 100*load) {
994 schedstat_inc(this_sd, ttwu_move_balance);
995 schedstat_inc(p, se.nr_wakeups_passive);
996 goto out_set_cpu;
997 }
998 }
999 }
1000
1001 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1002out_set_cpu:
1003 return wake_idle(new_cpu, p);
1004}
1005#endif /* CONFIG_SMP */
1006
1007
1008/*
864 * Preempt the current task with a newly woken task if needed: 1009 * Preempt the current task with a newly woken task if needed:
865 */ 1010 */
866static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1011static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = {
1153 .enqueue_task = enqueue_task_fair, 1298 .enqueue_task = enqueue_task_fair,
1154 .dequeue_task = dequeue_task_fair, 1299 .dequeue_task = dequeue_task_fair,
1155 .yield_task = yield_task_fair, 1300 .yield_task = yield_task_fair,
1301#ifdef CONFIG_SMP
1302 .select_task_rq = select_task_rq_fair,
1303#endif /* CONFIG_SMP */
1156 1304
1157 .check_preempt_curr = check_preempt_wakeup, 1305 .check_preempt_curr = check_preempt_wakeup,
1158 1306
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c15b8b..ca5374860aef 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync)
10{
11 return task_cpu(p); /* IDLE tasks are never migrated */
12}
13#endif /* CONFIG_SMP */
8/* 14/*
9 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
10 */ 16 */
@@ -72,6 +78,9 @@ const struct sched_class idle_sched_class = {
72 78
73 /* dequeue is not valid, we print a debug message there: */ 79 /* dequeue is not valid, we print a debug message there: */
74 .dequeue_task = dequeue_task_idle, 80 .dequeue_task = dequeue_task_idle,
81#ifdef CONFIG_SMP
82 .select_task_rq = select_task_rq_idle,
83#endif /* CONFIG_SMP */
75 84
76 .check_preempt_curr = check_preempt_curr_idle, 85 .check_preempt_curr = check_preempt_curr_idle,
77 86
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b788e35ffd3f..5de1aebdbd1b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -150,6 +150,13 @@ yield_task_rt(struct rq *rq)
150 requeue_task_rt(rq, rq->curr); 150 requeue_task_rt(rq, rq->curr);
151} 151}
152 152
153#ifdef CONFIG_SMP
154static int select_task_rq_rt(struct task_struct *p, int sync)
155{
156 return task_cpu(p); /* keep the task on its current cpu; no load-based routing here */
157}
158#endif /* CONFIG_SMP */
159
153/* 160/*
154 * Preempt the current task with a newly woken task if needed: 161 * Preempt the current task with a newly woken task if needed:
155 */ 162 */
@@ -667,6 +674,9 @@ const struct sched_class rt_sched_class = {
667 .enqueue_task = enqueue_task_rt, 674 .enqueue_task = enqueue_task_rt,
668 .dequeue_task = dequeue_task_rt, 675 .dequeue_task = dequeue_task_rt,
669 .yield_task = yield_task_rt, 676 .yield_task = yield_task_rt,
677#ifdef CONFIG_SMP
678 .select_task_rq = select_task_rq_rt,
679#endif /* CONFIG_SMP */
670 680
671 .check_preempt_curr = check_preempt_curr_rt, 681 .check_preempt_curr = check_preempt_curr_rt,
672 682