author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2008-06-27 07:41:28 -0400
committer  Ingo Molnar <mingo@elte.hu>               2008-06-27 08:31:40 -0400
commit     408ed066b11cf9ee4536573b4269ee3613bd735e
tree       3b1367dafc96835c2fcbf289a7af808eb2efa605 /kernel
parent     bb3469ac9b50f14ad6eba129ca0ad4fd033097a0
sched: hierarchical load vs find_busiest_group
find_busiest_group() assumes that task weights fall in the NICE_0_LOAD range.
Hierarchical task groups break this assumption; fix this by replacing the fixed
NICE_0_LOAD-based thresholds with the average task weight, which adapts to the
situation.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
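
To illustrate the idea, here is a rough, self-contained user-space sketch (not kernel code: the group data, the helper names, and the simple divide-by-CPU-count averaging are made up for illustration). With a heavy cgroup task on one CPU and a NICE_0 task on the other, a fixed SCHED_LOAD_SCALE threshold flags the group as imbalanced even though each CPU runs exactly one task, while a threshold of two average task weights does not.

/*
 * Minimal sketch of the threshold change, with hypothetical numbers.
 * The real patch computes the group average with sg_div_cpu_power()
 * over the summed cpu_avg_load_per_task() values, as in the diff below.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL          /* NICE_0_LOAD */

struct cpu_sample {
	unsigned long load;              /* roughly weighted_cpuload(i) */
	unsigned long nr_running;        /* roughly rq->nr_running      */
};

/* average task weight on one CPU, roughly cpu_avg_load_per_task(i) */
static unsigned long avg_load_per_task(const struct cpu_sample *c)
{
	return c->nr_running ? c->load / c->nr_running : 0;
}

int main(void)
{
	/* hypothetical group: one cgroup task of weight 3072, one NICE_0 task */
	struct cpu_sample group[] = {
		{ .load = 3072, .nr_running = 1 },
		{ .load = 1024, .nr_running = 1 },
	};
	int n = sizeof(group) / sizeof(group[0]);
	unsigned long max_load = 0, min_load = ~0UL, sum_avg = 0;

	for (int i = 0; i < n; i++) {
		if (group[i].load > max_load)
			max_load = group[i].load;
		if (group[i].load < min_load)
			min_load = group[i].load;
		sum_avg += avg_load_per_task(&group[i]);
	}

	/* group average task weight (simplified: divide by CPU count) */
	unsigned long avg = sum_avg / n;

	/* old check: fixed threshold fires although both CPUs run one task */
	printf("old: imbalanced = %d\n",
	       (max_load - min_load) > SCHED_LOAD_SCALE);

	/* new check: threshold scales with the weight of the tasks present */
	printf("new: imbalanced = %d\n",
	       (max_load - min_load) > 2 * avg);

	return 0;
}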
Diffstat (limited to 'kernel')

-rw-r--r--  kernel/sched.c  26

1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6a6b0139eb32..5e2aa394a812 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
+
 	if (idle == CPU_NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == CPU_NEWLY_IDLE)
@@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
+		unsigned long sum_avg_load_per_task;
+		unsigned long avg_load_per_task;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		sum_avg_load_per_task = avg_load_per_task = 0;
+
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
@@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
 			sum_weighted_load += weighted_cpuload(i);
+
+			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
 
 		/*
@@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
-		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+
+		/*
+		 * Consider the group unbalanced when the imbalance is larger
+		 * than the average weight of two tasks.
+		 *
+		 * APZ: with cgroup the avg task weight can vary wildly and
+		 * might not be a suitable number - should we keep a
+		 * normalized nr_running number somewhere that negates
+		 * the hierarchy?
+		 */
+		avg_load_per_task = sg_div_cpu_power(group,
+				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 			__group_imb = 1;
 
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3267,9 +3287,9 @@ small_imbalance:
 			if (busiest_load_per_task > this_load_per_task)
 				imbn = 1;
 		} else
-			this_load_per_task = SCHED_LOAD_SCALE;
+			this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-		if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+		if (max_load - this_load + 2*busiest_load_per_task >=
 				busiest_load_per_task * imbn) {
 			*imbalance = busiest_load_per_task;
 			return busiest;
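
The small_imbalance hunk above follows the same principle: the fixed SCHED_LOAD_SCALE_FUZZ margin is replaced by two busiest_load_per_task, so the margin scales with the weight of the tasks actually being balanced. A rough illustration with made-up numbers (the fuzz value, the loads, and the imbn choice are hypothetical, not taken from the kernel):

#include <stdio.h>

/* made-up example: a single cgroup task of weight 4096 on the busiest CPU */
int main(void)
{
	unsigned long max_load = 4096, this_load = 0;
	unsigned long busiest_load_per_task = 4096;
	unsigned long imbn = 2;           /* hypothetical; 1 or 2 in the real code */
	unsigned long fixed_fuzz = 1024;  /* a NICE_0_LOAD-sized fuzz, hypothetical */

	/* old form: fixed margin, independent of how heavy the tasks are */
	printf("old: %d\n",
	       max_load - this_load + fixed_fuzz >=
	       busiest_load_per_task * imbn);

	/* new form: margin of two of the tasks we are actually balancing */
	printf("new: %d\n",
	       max_load - this_load + 2 * busiest_load_per_task >=
	       busiest_load_per_task * imbn);

	return 0;
}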