author     Nick Piggin <nickpiggin@yahoo.com.au>          2005-06-25 17:57:13 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>      2005-06-25 19:24:41 -0400
commit     7897986bad8f6cd50d6149345aca7f6480f49464 (patch)
tree       10a5e08e004ae685aaab6823a3774803455b7704
parent     99b61ccf0bf0e9a85823d39a5db6a1519caeb13d (diff)
[PATCH] sched: balance timers
Do CPU load averaging over a number of different intervals. Allow each interval to be chosen by passing a parameter to source_load and target_load: 0 is instantaneous, idx > 0 returns a decaying average with the most recent sample weighted at 2^(idx-1), up to a maximum idx of 3 (which could easily be increased). Generally, a higher index results in more conservative balancing.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  include/asm-i386/topology.h    |   4
-rw-r--r--  include/asm-x86_64/topology.h  |   6
-rw-r--r--  include/linux/sched.h          |   4
-rw-r--r--  include/linux/topology.h       |   8
-rw-r--r--  kernel/sched.c                 | 138
5 files changed, 95 insertions(+), 65 deletions(-)
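As a reading aid before the diff itself, here is a small stand-alone sketch of the averaging scheme the commit message describes. The update rule mirrors the loop the patch adds to rebalance_tick() and the type parameter added to source_load()/target_load(); everything else (struct fake_rq, NR_LOAD_IDX, the SCHED_LOAD_SCALE value chosen here, and the main() driver) is invented purely for illustration and is not part of the patch.

/*
 * Stand-alone sketch of multi-interval CPU load averaging.
 * fake_rq, NR_LOAD_IDX and main() are illustrative only.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* fixed-point scale, stand-in for the kernel's value */
#define NR_LOAD_IDX 3		/* matches cpu_load[3] in struct runqueue */

struct fake_rq {
	unsigned long nr_running;
	unsigned long cpu_load[NR_LOAD_IDX];
};

/* One scheduler tick: index i weights the newest sample at 1/2^i. */
static void update_cpu_load(struct fake_rq *rq)
{
	unsigned long this_load = rq->nr_running * SCHED_LOAD_SCALE;
	int i;

	for (i = 0; i < NR_LOAD_IDX; i++) {
		unsigned long old_load = rq->cpu_load[i];
		unsigned long new_load = this_load;
		int scale = 1 << i;

		/* Round up so a rising load is never under-reported. */
		if (new_load > old_load)
			new_load += scale - 1;
		rq->cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
	}
}

/* type == 0: instantaneous load; type > 0: decaying average, biased low. */
static unsigned long source_load(struct fake_rq *rq, int type)
{
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	if (type == 0)
		return load_now;
	return load_now < rq->cpu_load[type - 1] ? load_now : rq->cpu_load[type - 1];
}

/* Same lookup, but biased high, as for a migration target. */
static unsigned long target_load(struct fake_rq *rq, int type)
{
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	if (type == 0)
		return load_now;
	return load_now > rq->cpu_load[type - 1] ? load_now : rq->cpu_load[type - 1];
}

int main(void)
{
	struct fake_rq rq = { .nr_running = 4 };	/* sudden burst of 4 tasks */
	int tick;

	for (tick = 0; tick < 5; tick++) {
		update_cpu_load(&rq);
		printf("tick %d: inst=%lu src1=%lu src3=%lu tgt3=%lu\n",
		       tick, source_load(&rq, 0), source_load(&rq, 1),
		       source_load(&rq, 3), target_load(&rq, 3));
	}
	return 0;
}

With nr_running jumping to 4, cpu_load[0] reflects the burst immediately while cpu_load[2] approaches it over several ticks; that lag is what lets find_busiest_group() take a slower, more conservative view via busy_idx/idle_idx/newidle_idx, while try_to_wake_up() uses the quicker wake_idx view.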
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 6d0f67507b21..0055fbfeec7b 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node)
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
+	.wake_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 8f77e9f6bc23..fe8d80a15751 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -39,7 +39,11 @@ extern int __node_distance(int, int);
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 2,			\
+	.newidle_idx		= 1,			\
+	.wake_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c69682b0444..664981ac1fb6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -488,6 +488,10 @@ struct sched_domain {
 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
 	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
+	unsigned int busy_idx;
+	unsigned int idle_idx;
+	unsigned int newidle_idx;
+	unsigned int wake_idx;
 	int flags;			/* See SD_* */
 
 	/* Runtime fields. */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d70e8972c67f..ae9c2216dfa6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -89,6 +89,10 @@
 	.cache_hot_time		= 0,			\
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 25,			\
+	.busy_idx		= 0,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 0,			\
+	.wake_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
@@ -115,6 +119,10 @@
 	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
+	.busy_idx		= 2,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 1,			\
+	.wake_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
diff --git a/kernel/sched.c b/kernel/sched.c
index f665de34ed82..b597b07e7911 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,7 +206,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long cpu_load;
+	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
 
@@ -886,23 +886,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return min(rq->cpu_load, load_now);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return max(rq->cpu_load, load_now);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 #endif
@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	runqueue_t *rq;
 #ifdef CONFIG_SMP
 	unsigned long load, this_load;
-	struct sched_domain *sd;
+	struct sched_domain *sd, *this_sd = NULL;
 	int new_cpu;
 #endif
 
@@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-#ifdef CONFIG_SCHEDSTATS
+	new_cpu = cpu;
+
 	schedstat_inc(rq, ttwu_cnt);
 	if (cpu == this_cpu) {
 		schedstat_inc(rq, ttwu_local);
-	} else {
-		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
+		goto out_set_cpu;
+	}
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			schedstat_inc(sd, ttwu_wake_remote);
+			this_sd = sd;
+			break;
 		}
 	}
-#endif
 
-	new_cpu = cpu;
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
-	load = source_load(cpu);
-	this_load = target_load(this_cpu);
-
 	/*
-	 * If sync wakeup then subtract the (maximum possible) effect of
-	 * the currently running task from the load of the current CPU:
+	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (sync)
-		this_load -= SCHED_LOAD_SCALE;
-
-	/* Don't pull the task off an idle CPU to a busy one */
-	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-		goto out_set_cpu;
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
 
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
 
-	/*
-	 * Scan domains for affine wakeup and passive balancing
-	 * possibilities.
-	 */
-	for_each_domain(this_cpu, sd) {
-		unsigned int imbalance;
 		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
+		 * If sync wakeup then subtract the (maximum possible) effect of
+		 * the currently running task from the load of the current CPU:
 		 */
-		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+		if (sync)
+			this_load -= SCHED_LOAD_SCALE;
+
+		/* Don't pull the task off an idle CPU to a busy one */
+		if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+			goto out_set_cpu;
 
-		if ((sd->flags & SD_WAKE_AFFINE) &&
-			!task_hot(p, rq->timestamp_last_tick, sd)) {
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if ((this_sd->flags & SD_WAKE_AFFINE) &&
+			!task_hot(p, rq->timestamp_last_tick, this_sd)) {
 			/*
 			 * This domain has SD_WAKE_AFFINE and p is cache cold
 			 * in this domain.
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_affine);
-				goto out_set_cpu;
-			}
-		} else if ((sd->flags & SD_WAKE_BALANCE) &&
+			schedstat_inc(this_sd, ttwu_move_affine);
+			goto out_set_cpu;
+		} else if ((this_sd->flags & SD_WAKE_BALANCE) &&
 				imbalance*this_load <= 100*load) {
 			/*
 			 * This domain has SD_WAKE_BALANCE and there is
 			 * an imbalance.
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_balance);
-				goto out_set_cpu;
-			}
+			schedstat_inc(this_sd, ttwu_move_balance);
+			goto out_set_cpu;
 		}
 	}
 
@@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	cpus_and(mask, sd->span, p->cpus_allowed);
 
 	for_each_cpu_mask(i, mask) {
-		load = target_load(i);
+		load = target_load(i, sd->wake_idx);
 
 		if (load < min_load) {
 			min_cpu = i;
@@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	}
 
 	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+	this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
 
 	/*
 	 * Would with the addition of the new task to the
@@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	if (idle == NOT_IDLE)
+		load_idx = sd->busy_idx;
+	else if (idle == NEWLY_IDLE)
+		load_idx = sd->newidle_idx;
+	else
+		load_idx = sd->idle_idx;
 
 	do {
 		unsigned long load;
@@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i);
+				load = target_load(i, load_idx);
 			else
-				load = source_load(i);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
+	int i;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	/*
-	 * Round up the averaging division if load is increasing. This
-	 * prevents us from getting stuck on 9 if the load is 10, for
-	 * example.
-	 */
-	if (this_load > old_load)
-		old_load++;
-	this_rq->cpu_load = (old_load + this_load) / 2;
+	/* Update our load */
+	for (i = 0; i < 3; i++) {
+		unsigned long new_load = this_load;
+		int scale = 1 << i;
+		old_load = this_rq->cpu_load[i];
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale-1;
+		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+	}
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -4921,13 +4929,15 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+		rq->nr_running = 0;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_dummy;
-		rq->cpu_load = 0;
+		for (j = 1; j < 3; j++)
+			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;