diff options
author | Gregory Haskins <ghaskins@novell.com> | 2008-01-25 15:08:09 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-01-25 15:08:09 -0500 |
commit | e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c (patch) | |
tree | 078940540641a59aaf199695bfc6de3f062a987b /kernel | |
parent | 697f0a487f294e634a342764472b79375bb3158a (diff) |
sched: de-SCHED_OTHER-ize the RT path
The current wake-up code path tries to determine if it can optimize the
wake-up to "this_cpu" by computing load calculations. The problem is that
these calculations are only relevant to SCHED_OTHER tasks where load is king.
For RT tasks, priority is king. So the load calculation is completely wasted
bandwidth.
Therefore, we create a new sched_class interface to help with
pre-wakeup routing decisions, and move the load calculation into the
CFS class so it is only performed for CFS tasks.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched.c | 167 | ||||
-rw-r--r-- | kernel/sched_fair.c | 148 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 9 | ||||
-rw-r--r-- | kernel/sched_rt.c | 10 |
4 files changed, 194 insertions, 140 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 66e99b419b31..3344ba776b97 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
960 | update_load_sub(&rq->load, load); | 960 | update_load_sub(&rq->load, load); |
961 | } | 961 | } |
962 | 962 | ||
963 | #ifdef CONFIG_SMP | ||
964 | static unsigned long source_load(int cpu, int type); | ||
965 | static unsigned long target_load(int cpu, int type); | ||
966 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
967 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
968 | #endif /* CONFIG_SMP */ | ||
969 | |||
963 | #include "sched_stats.h" | 970 | #include "sched_stats.h" |
964 | #include "sched_idletask.c" | 971 | #include "sched_idletask.c" |
965 | #include "sched_fair.c" | 972 | #include "sched_fair.c" |
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1118 | /* | 1125 | /* |
1119 | * Is this task likely cache-hot: | 1126 | * Is this task likely cache-hot: |
1120 | */ | 1127 | */ |
1121 | static inline int | 1128 | static int |
1122 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1129 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1123 | { | 1130 | { |
1124 | s64 delta; | 1131 | s64 delta; |
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type) | |||
1343 | /* | 1350 | /* |
1344 | * Return the average load per task on the cpu's run queue | 1351 | * Return the average load per task on the cpu's run queue |
1345 | */ | 1352 | */ |
1346 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1353 | static unsigned long cpu_avg_load_per_task(int cpu) |
1347 | { | 1354 | { |
1348 | struct rq *rq = cpu_rq(cpu); | 1355 | struct rq *rq = cpu_rq(cpu); |
1349 | unsigned long total = weighted_cpuload(cpu); | 1356 | unsigned long total = weighted_cpuload(cpu); |
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag) | |||
1500 | 1507 | ||
1501 | #endif /* CONFIG_SMP */ | 1508 | #endif /* CONFIG_SMP */ |
1502 | 1509 | ||
1503 | /* | ||
1504 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1505 | * not idle and an idle cpu is available. The span of cpus to | ||
1506 | * search starts with cpus closest then further out as needed, | ||
1507 | * so we always favor a closer, idle cpu. | ||
1508 | * | ||
1509 | * Returns the CPU we should wake onto. | ||
1510 | */ | ||
1511 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1512 | static int wake_idle(int cpu, struct task_struct *p) | ||
1513 | { | ||
1514 | cpumask_t tmp; | ||
1515 | struct sched_domain *sd; | ||
1516 | int i; | ||
1517 | |||
1518 | /* | ||
1519 | * If it is idle, then it is the best cpu to run this task. | ||
1520 | * | ||
1521 | * This cpu is also the best, if it has more than one task already. | ||
1522 | * Siblings must be also busy(in most cases) as they didn't already | ||
1523 | * pickup the extra load from this cpu and hence we need not check | ||
1524 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1525 | penalties associated with that. | ||
1526 | */ | ||
1527 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1528 | return cpu; | ||
1529 | |||
1530 | for_each_domain(cpu, sd) { | ||
1531 | if (sd->flags & SD_WAKE_IDLE) { | ||
1532 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
1533 | for_each_cpu_mask(i, tmp) { | ||
1534 | if (idle_cpu(i)) { | ||
1535 | if (i != task_cpu(p)) { | ||
1536 | schedstat_inc(p, | ||
1537 | se.nr_wakeups_idle); | ||
1538 | } | ||
1539 | return i; | ||
1540 | } | ||
1541 | } | ||
1542 | } else { | ||
1543 | break; | ||
1544 | } | ||
1545 | } | ||
1546 | return cpu; | ||
1547 | } | ||
1548 | #else | ||
1549 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1550 | { | ||
1551 | return cpu; | ||
1552 | } | ||
1553 | #endif | ||
1554 | |||
1555 | /*** | 1510 | /*** |
1556 | * try_to_wake_up - wake up a thread | 1511 | * try_to_wake_up - wake up a thread |
1557 | * @p: the to-be-woken-up thread | 1512 | * @p: the to-be-woken-up thread |
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1573 | long old_state; | 1528 | long old_state; |
1574 | struct rq *rq; | 1529 | struct rq *rq; |
1575 | #ifdef CONFIG_SMP | 1530 | #ifdef CONFIG_SMP |
1576 | struct sched_domain *sd, *this_sd = NULL; | ||
1577 | unsigned long load, this_load; | ||
1578 | int new_cpu; | 1531 | int new_cpu; |
1579 | #endif | 1532 | #endif |
1580 | 1533 | ||
@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1594 | if (unlikely(task_running(rq, p))) | 1547 | if (unlikely(task_running(rq, p))) |
1595 | goto out_activate; | 1548 | goto out_activate; |
1596 | 1549 | ||
1597 | new_cpu = cpu; | 1550 | new_cpu = p->sched_class->select_task_rq(p, sync); |
1598 | |||
1599 | schedstat_inc(rq, ttwu_count); | ||
1600 | if (cpu == this_cpu) { | ||
1601 | schedstat_inc(rq, ttwu_local); | ||
1602 | goto out_set_cpu; | ||
1603 | } | ||
1604 | |||
1605 | for_each_domain(this_cpu, sd) { | ||
1606 | if (cpu_isset(cpu, sd->span)) { | ||
1607 | schedstat_inc(sd, ttwu_wake_remote); | ||
1608 | this_sd = sd; | ||
1609 | break; | ||
1610 | } | ||
1611 | } | ||
1612 | |||
1613 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1614 | goto out_set_cpu; | ||
1615 | |||
1616 | /* | ||
1617 | * Check for affine wakeup and passive balancing possibilities. | ||
1618 | */ | ||
1619 | if (this_sd) { | ||
1620 | int idx = this_sd->wake_idx; | ||
1621 | unsigned int imbalance; | ||
1622 | |||
1623 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1624 | |||
1625 | load = source_load(cpu, idx); | ||
1626 | this_load = target_load(this_cpu, idx); | ||
1627 | |||
1628 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1629 | |||
1630 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1631 | unsigned long tl = this_load; | ||
1632 | unsigned long tl_per_task; | ||
1633 | |||
1634 | /* | ||
1635 | * Attract cache-cold tasks on sync wakeups: | ||
1636 | */ | ||
1637 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1638 | goto out_set_cpu; | ||
1639 | |||
1640 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1641 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1642 | |||
1643 | /* | ||
1644 | * If sync wakeup then subtract the (maximum possible) | ||
1645 | * effect of the currently running task from the load | ||
1646 | * of the current CPU: | ||
1647 | */ | ||
1648 | if (sync) | ||
1649 | tl -= current->se.load.weight; | ||
1650 | |||
1651 | if ((tl <= load && | ||
1652 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1653 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1654 | /* | ||
1655 | * This domain has SD_WAKE_AFFINE and | ||
1656 | * p is cache cold in this domain, and | ||
1657 | * there is no bad imbalance. | ||
1658 | */ | ||
1659 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1660 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1661 | goto out_set_cpu; | ||
1662 | } | ||
1663 | } | ||
1664 | |||
1665 | /* | ||
1666 | * Start passive balancing when half the imbalance_pct | ||
1667 | * limit is reached. | ||
1668 | */ | ||
1669 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1670 | if (imbalance*this_load <= 100*load) { | ||
1671 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1672 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1673 | goto out_set_cpu; | ||
1674 | } | ||
1675 | } | ||
1676 | } | ||
1677 | |||
1678 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1679 | out_set_cpu: | ||
1680 | new_cpu = wake_idle(new_cpu, p); | ||
1681 | if (new_cpu != cpu) { | 1551 | if (new_cpu != cpu) { |
1682 | set_task_cpu(p, new_cpu); | 1552 | set_task_cpu(p, new_cpu); |
1683 | task_rq_unlock(rq, &flags); | 1553 | task_rq_unlock(rq, &flags); |
@@ -1693,6 +1563,23 @@ out_set_cpu: | |||
1693 | cpu = task_cpu(p); | 1563 | cpu = task_cpu(p); |
1694 | } | 1564 | } |
1695 | 1565 | ||
1566 | #ifdef CONFIG_SCHEDSTATS | ||
1567 | schedstat_inc(rq, ttwu_count); | ||
1568 | if (cpu == this_cpu) | ||
1569 | schedstat_inc(rq, ttwu_local); | ||
1570 | else { | ||
1571 | struct sched_domain *sd; | ||
1572 | for_each_domain(this_cpu, sd) { | ||
1573 | if (cpu_isset(cpu, sd->span)) { | ||
1574 | schedstat_inc(sd, ttwu_wake_remote); | ||
1575 | break; | ||
1576 | } | ||
1577 | } | ||
1578 | } | ||
1579 | |||
1580 | #endif | ||
1581 | |||
1582 | |||
1696 | out_activate: | 1583 | out_activate: |
1697 | #endif /* CONFIG_SMP */ | 1584 | #endif /* CONFIG_SMP */ |
1698 | schedstat_inc(p, se.nr_wakeups); | 1585 | schedstat_inc(p, se.nr_wakeups); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c208e090ae4..f881fc5e035c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -861,6 +861,151 @@ static void yield_task_fair(struct rq *rq) | |||
861 | } | 861 | } |
862 | 862 | ||
863 | /* | 863 | /* |
864 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
865 | * not idle and an idle cpu is available. The span of cpus to | ||
866 | * search starts with cpus closest then further out as needed, | ||
867 | * so we always favor a closer, idle cpu. | ||
868 | * | ||
869 | * Returns the CPU we should wake onto. | ||
870 | */ | ||
871 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
872 | static int wake_idle(int cpu, struct task_struct *p) | ||
873 | { | ||
874 | cpumask_t tmp; | ||
875 | struct sched_domain *sd; | ||
876 | int i; | ||
877 | |||
878 | /* | ||
879 | * If it is idle, then it is the best cpu to run this task. | ||
880 | * | ||
881 | * This cpu is also the best, if it has more than one task already. | ||
882 | * Siblings must be also busy(in most cases) as they didn't already | ||
883 | * pickup the extra load from this cpu and hence we need not check | ||
884 | * sibling runqueue info. This will avoid the checks and cache miss | ||
885 | penalties associated with that. | ||
886 | */ | ||
887 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
888 | return cpu; | ||
889 | |||
890 | for_each_domain(cpu, sd) { | ||
891 | if (sd->flags & SD_WAKE_IDLE) { | ||
892 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
893 | for_each_cpu_mask(i, tmp) { | ||
894 | if (idle_cpu(i)) { | ||
895 | if (i != task_cpu(p)) { | ||
896 | schedstat_inc(p, | ||
897 | se.nr_wakeups_idle); | ||
898 | } | ||
899 | return i; | ||
900 | } | ||
901 | } | ||
902 | } else { | ||
903 | break; | ||
904 | } | ||
905 | } | ||
906 | return cpu; | ||
907 | } | ||
908 | #else | ||
909 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
910 | { | ||
911 | return cpu; | ||
912 | } | ||
913 | #endif | ||
914 | |||
915 | #ifdef CONFIG_SMP | ||
916 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
917 | { | ||
918 | int cpu, this_cpu; | ||
919 | struct rq *rq; | ||
920 | struct sched_domain *sd, *this_sd = NULL; | ||
921 | int new_cpu; | ||
922 | |||
923 | cpu = task_cpu(p); | ||
924 | rq = task_rq(p); | ||
925 | this_cpu = smp_processor_id(); | ||
926 | new_cpu = cpu; | ||
927 | |||
928 | for_each_domain(this_cpu, sd) { | ||
929 | if (cpu_isset(cpu, sd->span)) { | ||
930 | this_sd = sd; | ||
931 | break; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
936 | goto out_set_cpu; | ||
937 | |||
938 | /* | ||
939 | * Check for affine wakeup and passive balancing possibilities. | ||
940 | */ | ||
941 | if (this_sd) { | ||
942 | int idx = this_sd->wake_idx; | ||
943 | unsigned int imbalance; | ||
944 | unsigned long load, this_load; | ||
945 | |||
946 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
947 | |||
948 | load = source_load(cpu, idx); | ||
949 | this_load = target_load(this_cpu, idx); | ||
950 | |||
951 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
952 | |||
953 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
954 | unsigned long tl = this_load; | ||
955 | unsigned long tl_per_task; | ||
956 | |||
957 | /* | ||
958 | * Attract cache-cold tasks on sync wakeups: | ||
959 | */ | ||
960 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
961 | goto out_set_cpu; | ||
962 | |||
963 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
964 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
965 | |||
966 | /* | ||
967 | * If sync wakeup then subtract the (maximum possible) | ||
968 | * effect of the currently running task from the load | ||
969 | * of the current CPU: | ||
970 | */ | ||
971 | if (sync) | ||
972 | tl -= current->se.load.weight; | ||
973 | |||
974 | if ((tl <= load && | ||
975 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
976 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
977 | /* | ||
978 | * This domain has SD_WAKE_AFFINE and | ||
979 | * p is cache cold in this domain, and | ||
980 | * there is no bad imbalance. | ||
981 | */ | ||
982 | schedstat_inc(this_sd, ttwu_move_affine); | ||
983 | schedstat_inc(p, se.nr_wakeups_affine); | ||
984 | goto out_set_cpu; | ||
985 | } | ||
986 | } | ||
987 | |||
988 | /* | ||
989 | * Start passive balancing when half the imbalance_pct | ||
990 | * limit is reached. | ||
991 | */ | ||
992 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
993 | if (imbalance*this_load <= 100*load) { | ||
994 | schedstat_inc(this_sd, ttwu_move_balance); | ||
995 | schedstat_inc(p, se.nr_wakeups_passive); | ||
996 | goto out_set_cpu; | ||
997 | } | ||
998 | } | ||
999 | } | ||
1000 | |||
1001 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1002 | out_set_cpu: | ||
1003 | return wake_idle(new_cpu, p); | ||
1004 | } | ||
1005 | #endif /* CONFIG_SMP */ | ||
1006 | |||
1007 | |||
1008 | /* | ||
864 | * Preempt the current task with a newly woken task if needed: | 1009 | * Preempt the current task with a newly woken task if needed: |
865 | */ | 1010 | */ |
866 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1011 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = { | |||
1153 | .enqueue_task = enqueue_task_fair, | 1298 | .enqueue_task = enqueue_task_fair, |
1154 | .dequeue_task = dequeue_task_fair, | 1299 | .dequeue_task = dequeue_task_fair, |
1155 | .yield_task = yield_task_fair, | 1300 | .yield_task = yield_task_fair, |
1301 | #ifdef CONFIG_SMP | ||
1302 | .select_task_rq = select_task_rq_fair, | ||
1303 | #endif /* CONFIG_SMP */ | ||
1156 | 1304 | ||
1157 | .check_preempt_curr = check_preempt_wakeup, | 1305 | .check_preempt_curr = check_preempt_wakeup, |
1158 | 1306 | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..ca5374860aef 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -5,6 +5,12 @@ | |||
5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifdef CONFIG_SMP | ||
9 | static int select_task_rq_idle(struct task_struct *p, int sync) | ||
10 | { | ||
11 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
12 | } | ||
13 | #endif /* CONFIG_SMP */ | ||
8 | /* | 14 | /* |
9 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
10 | */ | 16 | */ |
@@ -72,6 +78,9 @@ const struct sched_class idle_sched_class = { | |||
72 | 78 | ||
73 | /* dequeue is not valid, we print a debug message there: */ | 79 | /* dequeue is not valid, we print a debug message there: */ |
74 | .dequeue_task = dequeue_task_idle, | 80 | .dequeue_task = dequeue_task_idle, |
81 | #ifdef CONFIG_SMP | ||
82 | .select_task_rq = select_task_rq_idle, | ||
83 | #endif /* CONFIG_SMP */ | ||
75 | 84 | ||
76 | .check_preempt_curr = check_preempt_curr_idle, | 85 | .check_preempt_curr = check_preempt_curr_idle, |
77 | 86 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b788e35ffd3f..5de1aebdbd1b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -150,6 +150,13 @@ yield_task_rt(struct rq *rq) | |||
150 | requeue_task_rt(rq, rq->curr); | 150 | requeue_task_rt(rq, rq->curr); |
151 | } | 151 | } |
152 | 152 | ||
153 | #ifdef CONFIG_SMP | ||
154 | static int select_task_rq_rt(struct task_struct *p, int sync) | ||
155 | { | ||
156 | return task_cpu(p); | ||
157 | } | ||
158 | #endif /* CONFIG_SMP */ | ||
159 | |||
153 | /* | 160 | /* |
154 | * Preempt the current task with a newly woken task if needed: | 161 | * Preempt the current task with a newly woken task if needed: |
155 | */ | 162 | */ |
@@ -667,6 +674,9 @@ const struct sched_class rt_sched_class = { | |||
667 | .enqueue_task = enqueue_task_rt, | 674 | .enqueue_task = enqueue_task_rt, |
668 | .dequeue_task = dequeue_task_rt, | 675 | .dequeue_task = dequeue_task_rt, |
669 | .yield_task = yield_task_rt, | 676 | .yield_task = yield_task_rt, |
677 | #ifdef CONFIG_SMP | ||
678 | .select_task_rq = select_task_rq_rt, | ||
679 | #endif /* CONFIG_SMP */ | ||
670 | 680 | ||
671 | .check_preempt_curr = check_preempt_curr_rt, | 681 | .check_preempt_curr = check_preempt_curr_rt, |
672 | 682 | ||