-rw-r--r--  kernel/sched.c  175
1 file changed, 100 insertions, 75 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 109db122de50..1893d5562f5f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3237,6 +3237,103 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 
 	return load_idx;
 }
+
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			int local_group, const struct cpumask *cpus,
+			int *balance, struct sg_lb_stats *sgs)
+{
+	unsigned long load, max_cpu_load, min_cpu_load;
+	int i;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0;
+	unsigned long sum_avg_load_per_task;
+	unsigned long avg_load_per_task;
+
+	if (local_group)
+		balance_cpu = group_first_cpu(group);
+
+	/* Tally up the load of all CPUs in the group */
+	sum_avg_load_per_task = avg_load_per_task = 0;
+	max_cpu_load = 0;
+	min_cpu_load = ~0UL;
+
+	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+		struct rq *rq = cpu_rq(i);
+
+		if (*sd_idle && rq->nr_running)
+			*sd_idle = 0;
+
+		/* Bias balancing toward cpus of our domain */
+		if (local_group) {
+			if (idle_cpu(i) && !first_idle_cpu) {
+				first_idle_cpu = 1;
+				balance_cpu = i;
+			}
+
+			load = target_load(i, load_idx);
+		} else {
+			load = source_load(i, load_idx);
+			if (load > max_cpu_load)
+				max_cpu_load = load;
+			if (min_cpu_load > load)
+				min_cpu_load = load;
+		}
+
+		sgs->group_load += load;
+		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_weighted_load += weighted_cpuload(i);
+
+		sum_avg_load_per_task += cpu_avg_load_per_task(i);
+	}
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above
+	 * domains. In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (idle != CPU_NEWLY_IDLE && local_group &&
+	    balance_cpu != this_cpu && balance) {
+		*balance = 0;
+		return;
+	}
+
+	/* Adjust by relative CPU power of the group */
+	sgs->avg_load = sg_div_cpu_power(group,
+			sgs->group_load * SCHED_LOAD_SCALE);
+
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 * might not be a suitable number - should we keep a
+	 * normalized nr_running number somewhere that negates
+	 * the hierarchy?
+	 */
+	avg_load_per_task = sg_div_cpu_power(group,
+			sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+		sgs->group_imb = 1;
+
+	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3270,92 +3367,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 	do {
 		struct sg_lb_stats sgs;
-		unsigned long load, max_cpu_load, min_cpu_load;
 		int local_group;
-		int i;
-		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_avg_load_per_task;
-		unsigned long avg_load_per_task;
 
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
 		memset(&sgs, 0, sizeof(sgs));
+		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+					local_group, cpus, balance, &sgs);
 
-		if (local_group)
-			balance_cpu = group_first_cpu(group);
-
-		/* Tally up the load of all CPUs in the group */
-		sum_avg_load_per_task = avg_load_per_task = 0;
-
-		max_cpu_load = 0;
-		min_cpu_load = ~0UL;
-
-		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-			struct rq *rq = cpu_rq(i);
-
-			if (*sd_idle && rq->nr_running)
-				*sd_idle = 0;
-
-			/* Bias balancing toward cpus of our domain */
-			if (local_group) {
-				if (idle_cpu(i) && !first_idle_cpu) {
-					first_idle_cpu = 1;
-					balance_cpu = i;
-				}
-
-				load = target_load(i, load_idx);
-			} else {
-				load = source_load(i, load_idx);
-				if (load > max_cpu_load)
-					max_cpu_load = load;
-				if (min_cpu_load > load)
-					min_cpu_load = load;
-			}
-
-			sgs.group_load += load;
-			sgs.sum_nr_running += rq->nr_running;
-			sgs.sum_weighted_load += weighted_cpuload(i);
-
-			sum_avg_load_per_task += cpu_avg_load_per_task(i);
-		}
-
-		/*
-		 * First idle cpu or the first cpu(busiest) in this sched group
-		 * is eligible for doing load balancing at this and above
-		 * domains. In the newly idle case, we will allow all the cpu's
-		 * to do the newly idle load balance.
-		 */
-		if (idle != CPU_NEWLY_IDLE && local_group &&
-		    balance_cpu != this_cpu && balance) {
-			*balance = 0;
+		if (balance && !(*balance))
 			goto ret;
-		}
 
 		total_load += sgs.group_load;
 		total_pwr += group->__cpu_power;
 
-		/* Adjust by relative CPU power of the group */
-		sgs.avg_load = sg_div_cpu_power(group,
-				sgs.group_load * SCHED_LOAD_SCALE);
-
-
-		/*
-		 * Consider the group unbalanced when the imbalance is larger
-		 * than the average weight of two tasks.
-		 *
-		 * APZ: with cgroup the avg task weight can vary wildly and
-		 * might not be a suitable number - should we keep a
-		 * normalized nr_running number somewhere that negates
-		 * the hierarchy?
-		 */
-		avg_load_per_task = sg_div_cpu_power(group,
-				sum_avg_load_per_task * SCHED_LOAD_SCALE);
-
-		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			sgs.group_imb = 1;
-
-		sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
-
 		if (local_group) {
 			this_load = sgs.avg_load;
 			this = group;
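
Note: the group-imbalance heuristic that update_sg_lb_stats() carries over from find_busiest_group() can be exercised on its own. The sketch below is a simplified user-space model, not kernel code: it drops the local_group biasing and the sg_div_cpu_power() scaling, averages the per-task load over the CPUs directly, and feeds in made-up per-CPU loads. SCALE, struct group_stats and tally_group() are illustrative stand-ins for SCHED_LOAD_SCALE, struct sg_lb_stats and the new helper.

/*
 * Illustrative user-space sketch (not kernel code) of the per-group
 * accounting done by update_sg_lb_stats(): tally the load of every CPU
 * in a group, track the most- and least-loaded CPU, and flag the group
 * as imbalanced when the spread exceeds twice the average per-task load.
 */
#include <stdio.h>

#define SCALE 1024UL	/* stand-in for SCHED_LOAD_SCALE */

struct group_stats {
	unsigned long group_load;		/* sum of per-CPU load */
	unsigned long avg_load_per_task;
	int group_imb;				/* 1 if the group looks internally imbalanced */
};

static void tally_group(const unsigned long *cpu_load,
			const unsigned long *cpu_load_per_task,
			int nr_cpus, struct group_stats *sgs)
{
	unsigned long max_cpu_load = 0, min_cpu_load = ~0UL;
	unsigned long sum_load_per_task = 0;
	int i;

	for (i = 0; i < nr_cpus; i++) {
		unsigned long load = cpu_load[i];

		if (load > max_cpu_load)
			max_cpu_load = load;
		if (min_cpu_load > load)
			min_cpu_load = load;

		sgs->group_load += load;
		sum_load_per_task += cpu_load_per_task[i];
	}

	/* Simplified: a plain average instead of the power-scaled divide. */
	sgs->avg_load_per_task = sum_load_per_task / nr_cpus;

	/* Same shape as the kernel test: spread > 2 * avg task weight. */
	if ((max_cpu_load - min_cpu_load) > 2 * sgs->avg_load_per_task)
		sgs->group_imb = 1;
}

int main(void)
{
	/* Hypothetical 4-CPU group: one CPU carrying three tasks, three idle. */
	unsigned long cpu_load[4]          = { 3 * SCALE, 0, 0, 0 };
	unsigned long cpu_load_per_task[4] = { SCALE, 0, 0, 0 };
	struct group_stats sgs = { 0, 0, 0 };

	tally_group(cpu_load, cpu_load_per_task, 4, &sgs);
	printf("group_load=%lu avg_load_per_task=%lu group_imb=%d\n",
	       sgs.group_load, sgs.avg_load_per_task, sgs.group_imb);
	return 0;
}

With one CPU carrying three tasks and the rest idle, the spread (3*SCALE) exceeds twice the average task weight, so the group is flagged imbalanced; that is the situation the 2*avg_load_per_task test in the patch is meant to catch.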