about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2008-06-27 07:41:28 -0400
committerIngo Molnar <mingo@elte.hu>2008-06-27 08:31:40 -0400
commit408ed066b11cf9ee4536573b4269ee3613bd735e (patch)
tree3b1367dafc96835c2fcbf289a7af808eb2efa605
parentbb3469ac9b50f14ad6eba129ca0ad4fd033097a0 (diff)
sched: hierarchical load vs find_busiest_group
find_busiest_group() has some assumptions about task weight being in the NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this by replacing it with the average task weight, which will adapt the situation. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Mike Galbraith <efault@gmx.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  kernel/sched.c | 26
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6a6b0139eb32..5e2aa394a812 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
+
 	if (idle == CPU_NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == CPU_NEWLY_IDLE)
@@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
+		unsigned long sum_avg_load_per_task;
+		unsigned long avg_load_per_task;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		sum_avg_load_per_task = avg_load_per_task = 0;
+
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
@@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
 			sum_weighted_load += weighted_cpuload(i);
+
+			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
 
 		/*
@@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
-		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+
+		/*
+		 * Consider the group unbalanced when the imbalance is larger
+		 * than the average weight of two tasks.
+		 *
+		 * APZ: with cgroup the avg task weight can vary wildly and
+		 * might not be a suitable number - should we keep a
+		 * normalized nr_running number somewhere that negates
+		 * the hierarchy?
+		 */
+		avg_load_per_task = sg_div_cpu_power(group,
+				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 			__group_imb = 1;
 
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3267,9 +3287,9 @@ small_imbalance:
 		if (busiest_load_per_task > this_load_per_task)
 			imbn = 1;
 	} else
-		this_load_per_task = SCHED_LOAD_SCALE;
+		this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+	if (max_load - this_load + 2*busiest_load_per_task >=
 			busiest_load_per_task * imbn) {
 		*imbalance = busiest_load_per_task;
 		return busiest;