author		Ingo Molnar <mingo@elte.hu>	2009-03-26 10:25:24 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-03-26 10:25:24 -0400
commit		66fef08f7d5267b2312c4b48a6d2957d2d414105 (patch)
tree		ee6fb1ce3354c8a06c7c1b08b2c386aaebf07a33
parent		b6d9842258d1ba27fb978cded74eb4b6aa15edc8 (diff)
parent		b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 (diff)
Merge branch 'sched/balancing' into sched/core
-rw-r--r--	kernel/sched.c	765
1 file changed, 515 insertions, 250 deletions
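For orientation before the raw diff: the merged series splits the old monolithic statistics-gathering loop of find_busiest_group() into small helpers built around two new structures, sg_lb_stats (per sched_group) and sd_lb_stats (per sched_domain), which update_sg_lb_stats() and update_sd_lb_stats() fill in before calculate_imbalance() and the power-savings helpers consume them. The fragment below is only a simplified sketch of that two-level aggregation, using hypothetical names (sg_stats, sd_stats, fold_group) and made-up loads; it is not the kernel implementation.

#include <stdio.h>

struct sg_stats {                 /* cf. sg_lb_stats in the patch (hypothetical, reduced) */
	unsigned long group_load;     /* total load over the CPUs of the group */
	unsigned long nr_running;     /* tasks running in the group */
};

struct sd_stats {                 /* cf. sd_lb_stats in the patch (hypothetical, reduced) */
	unsigned long total_load;     /* load summed over all groups */
	unsigned long max_load;       /* load of the busiest group seen so far */
	int busiest;                  /* index of that group */
};

/* Fold one group's statistics into the domain-wide statistics. */
static void fold_group(struct sd_stats *sds, const struct sg_stats *sgs, int idx)
{
	sds->total_load += sgs->group_load;
	if (sgs->group_load > sds->max_load) {
		sds->max_load = sgs->group_load;
		sds->busiest = idx;
	}
}

int main(void)
{
	struct sg_stats groups[] = { { 300, 2 }, { 900, 5 }, { 600, 3 } };
	struct sd_stats sds = { 0, 0, -1 };
	int i;

	for (i = 0; i < 3; i++)
		fold_group(&sds, &groups[i], i);

	printf("busiest group: %d (load %lu of %lu total)\n",
	       sds.busiest, sds.max_load, sds.total_load);
	return 0;
}

The kernel version additionally tracks group capacity, per-task loads and an imbalance flag, as the diff below shows.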
diff --git a/kernel/sched.c b/kernel/sched.c
index 7b389c74f8ff..9f8506d68fdc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3189 3189
3190 return 0; 3190 return 0;
3191} 3191}
3192/********** Helpers for find_busiest_group ************************/
3193/**
3194 * sd_lb_stats - Structure to store the statistics of a sched_domain
3195 * during load balancing.
3196 */
3197struct sd_lb_stats {
3198 struct sched_group *busiest; /* Busiest group in this sd */
3199 struct sched_group *this; /* Local group in this sd */
3200 unsigned long total_load; /* Total load of all groups in sd */
3201 unsigned long total_pwr; /* Total power of all groups in sd */
3202 unsigned long avg_load; /* Average load across all groups in sd */
3203
3204 /** Statistics of this group */
3205 unsigned long this_load;
3206 unsigned long this_load_per_task;
3207 unsigned long this_nr_running;
3208
3209 /* Statistics of the busiest group */
3210 unsigned long max_load;
3211 unsigned long busiest_load_per_task;
3212 unsigned long busiest_nr_running;
3213
 3214	int group_imb; /* Is there an imbalance in this sd? */
3215#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3216 int power_savings_balance; /* Is powersave balance needed for this sd */
3217 struct sched_group *group_min; /* Least loaded group in sd */
3218 struct sched_group *group_leader; /* Group which relieves group_min */
3219 unsigned long min_load_per_task; /* load_per_task in group_min */
3220 unsigned long leader_nr_running; /* Nr running of group_leader */
3221 unsigned long min_nr_running; /* Nr running of group_min */
3222#endif
3223};
3192 3224
3193/* 3225/**
3194 * find_busiest_group finds and returns the busiest CPU group within the 3226 * sg_lb_stats - stats of a sched_group required for load_balancing
3195 * domain. It calculates and returns the amount of weighted load which 3227 */
3196 * should be moved to restore balance via the imbalance parameter. 3228struct sg_lb_stats {
 3229	unsigned long avg_load; /* Avg load across the CPUs of the group */
3230 unsigned long group_load; /* Total load over the CPUs of the group */
3231 unsigned long sum_nr_running; /* Nr tasks running in the group */
3232 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3233 unsigned long group_capacity;
3234 int group_imb; /* Is there an imbalance in the group ? */
3235};
3236
3237/**
3238 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3239 * @group: The group whose first cpu is to be returned.
3197 */ 3240 */
3198static struct sched_group * 3241static inline unsigned int group_first_cpu(struct sched_group *group)
3199find_busiest_group(struct sched_domain *sd, int this_cpu,
3200 unsigned long *imbalance, enum cpu_idle_type idle,
3201 int *sd_idle, const struct cpumask *cpus, int *balance)
3202{ 3242{
3203 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3243 return cpumask_first(sched_group_cpus(group));
3204 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3244}
3205 unsigned long max_pull;
3206 unsigned long busiest_load_per_task, busiest_nr_running;
3207 unsigned long this_load_per_task, this_nr_running;
3208 int load_idx, group_imb = 0;
3209#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3210 int power_savings_balance = 1;
3211 unsigned long leader_nr_running = 0, min_load_per_task = 0;
3212 unsigned long min_nr_running = ULONG_MAX;
3213 struct sched_group *group_min = NULL, *group_leader = NULL;
3214#endif
3215 3245
3216 max_load = this_load = total_load = total_pwr = 0; 3246/**
3217 busiest_load_per_task = busiest_nr_running = 0; 3247 * get_sd_load_idx - Obtain the load index for a given sched domain.
3218 this_load_per_task = this_nr_running = 0; 3248 * @sd: The sched_domain whose load_idx is to be obtained.
 3249 * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
3250 */
3251static inline int get_sd_load_idx(struct sched_domain *sd,
3252 enum cpu_idle_type idle)
3253{
3254 int load_idx;
3219 3255
3220 if (idle == CPU_NOT_IDLE) 3256 switch (idle) {
3257 case CPU_NOT_IDLE:
3221 load_idx = sd->busy_idx; 3258 load_idx = sd->busy_idx;
3222 else if (idle == CPU_NEWLY_IDLE) 3259 break;
3260
3261 case CPU_NEWLY_IDLE:
3223 load_idx = sd->newidle_idx; 3262 load_idx = sd->newidle_idx;
3224 else 3263 break;
3264 default:
3225 load_idx = sd->idle_idx; 3265 load_idx = sd->idle_idx;
3266 break;
3267 }
3226 3268
3227 do { 3269 return load_idx;
3228 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3270}
3229 int local_group;
3230 int i;
3231 int __group_imb = 0;
3232 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3233 unsigned long sum_nr_running, sum_weighted_load;
3234 unsigned long sum_avg_load_per_task;
3235 unsigned long avg_load_per_task;
3236 3271
3237 local_group = cpumask_test_cpu(this_cpu,
3238 sched_group_cpus(group));
3239 3272
3240 if (local_group) 3273#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3241 balance_cpu = cpumask_first(sched_group_cpus(group)); 3274/**
3275 * init_sd_power_savings_stats - Initialize power savings statistics for
3276 * the given sched_domain, during load balancing.
3277 *
3278 * @sd: Sched domain whose power-savings statistics are to be initialized.
3279 * @sds: Variable containing the statistics for sd.
3280 * @idle: Idle status of the CPU at which we're performing load-balancing.
3281 */
3282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3284{
3285 /*
3286 * Busy processors will not participate in power savings
3287 * balance.
3288 */
3289 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3290 sds->power_savings_balance = 0;
3291 else {
3292 sds->power_savings_balance = 1;
3293 sds->min_nr_running = ULONG_MAX;
3294 sds->leader_nr_running = 0;
3295 }
3296}
3242 3297
3243 /* Tally up the load of all CPUs in the group */ 3298/**
3244 sum_weighted_load = sum_nr_running = avg_load = 0; 3299 * update_sd_power_savings_stats - Update the power saving stats for a
3245 sum_avg_load_per_task = avg_load_per_task = 0; 3300 * sched_domain while performing load balancing.
3301 *
3302 * @group: sched_group belonging to the sched_domain under consideration.
3303 * @sds: Variable containing the statistics of the sched_domain
3304 * @local_group: Does group contain the CPU for which we're performing
3305 * load balancing ?
3306 * @sgs: Variable containing the statistics of the group.
3307 */
3308static inline void update_sd_power_savings_stats(struct sched_group *group,
3309 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3310{
3246 3311
3247 max_cpu_load = 0; 3312 if (!sds->power_savings_balance)
3248 min_cpu_load = ~0UL; 3313 return;
3249 3314
3250 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3315 /*
3251 struct rq *rq = cpu_rq(i); 3316 * If the local group is idle or completely loaded
3317 * no need to do power savings balance at this domain
3318 */
3319 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3320 !sds->this_nr_running))
3321 sds->power_savings_balance = 0;
3252 3322
3253 if (*sd_idle && rq->nr_running) 3323 /*
3254 *sd_idle = 0; 3324 * If a group is already running at full capacity or idle,
3325 * don't include that group in power savings calculations
3326 */
3327 if (!sds->power_savings_balance ||
3328 sgs->sum_nr_running >= sgs->group_capacity ||
3329 !sgs->sum_nr_running)
3330 return;
3255 3331
3256 /* Bias balancing toward cpus of our domain */ 3332 /*
3257 if (local_group) { 3333 * Calculate the group which has the least non-idle load.
3258 if (idle_cpu(i) && !first_idle_cpu) { 3334 * This is the group from where we need to pick up the load
3259 first_idle_cpu = 1; 3335 * for saving power
3260 balance_cpu = i; 3336 */
3261 } 3337 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3338 (sgs->sum_nr_running == sds->min_nr_running &&
3339 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3340 sds->group_min = group;
3341 sds->min_nr_running = sgs->sum_nr_running;
3342 sds->min_load_per_task = sgs->sum_weighted_load /
3343 sgs->sum_nr_running;
3344 }
3262 3345
3263 load = target_load(i, load_idx); 3346 /*
3264 } else { 3347 * Calculate the group which is almost near its
3265 load = source_load(i, load_idx); 3348 * capacity but still has some space to pick up some load
3266 if (load > max_cpu_load) 3349 * from other group and save more power
3267 max_cpu_load = load; 3350 */
3268 if (min_cpu_load > load) 3351 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3269 min_cpu_load = load; 3352 return;
3270 }
3271 3353
3272 avg_load += load; 3354 if (sgs->sum_nr_running > sds->leader_nr_running ||
3273 sum_nr_running += rq->nr_running; 3355 (sgs->sum_nr_running == sds->leader_nr_running &&
3274 sum_weighted_load += weighted_cpuload(i); 3356 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3357 sds->group_leader = group;
3358 sds->leader_nr_running = sgs->sum_nr_running;
3359 }
3360}
3275 3361
3276 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3362/**
3277 } 3363 * check_power_save_busiest_group - Check if we have potential to perform
3364 * some power-savings balance. If yes, set the busiest group to be
 3365 * the least loaded group in the sched_domain, so that its CPUs can
3366 * be put to idle.
3367 *
3368 * @sds: Variable containing the statistics of the sched_domain
3369 * under consideration.
3370 * @this_cpu: Cpu at which we're currently performing load-balancing.
3371 * @imbalance: Variable to store the imbalance.
3372 *
3373 * Returns 1 if there is potential to perform power-savings balance.
3374 * Else returns 0.
3375 */
3376static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3377 int this_cpu, unsigned long *imbalance)
3378{
3379 if (!sds->power_savings_balance)
3380 return 0;
3278 3381
3279 /* 3382 if (sds->this != sds->group_leader ||
3280 * First idle cpu or the first cpu(busiest) in this sched group 3383 sds->group_leader == sds->group_min)
3281 * is eligible for doing load balancing at this and above 3384 return 0;
3282 * domains. In the newly idle case, we will allow all the cpu's
3283 * to do the newly idle load balance.
3284 */
3285 if (idle != CPU_NEWLY_IDLE && local_group &&
3286 balance_cpu != this_cpu && balance) {
3287 *balance = 0;
3288 goto ret;
3289 }
3290 3385
3291 total_load += avg_load; 3386 *imbalance = sds->min_load_per_task;
3292 total_pwr += group->__cpu_power; 3387 sds->busiest = sds->group_min;
3293 3388
3294 /* Adjust by relative CPU power of the group */ 3389 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3295 avg_load = sg_div_cpu_power(group, 3390 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3296 avg_load * SCHED_LOAD_SCALE); 3391 group_first_cpu(sds->group_leader);
3392 }
3297 3393
3394 return 1;
3298 3395
3299 /* 3396}
3300 * Consider the group unbalanced when the imbalance is larger 3397#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3301 * than the average weight of two tasks. 3398static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3302 * 3399 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3303 * APZ: with cgroup the avg task weight can vary wildly and 3400{
3304 * might not be a suitable number - should we keep a 3401 return;
3305 * normalized nr_running number somewhere that negates 3402}
3306 * the hierarchy? 3403
3307 */ 3404static inline void update_sd_power_savings_stats(struct sched_group *group,
3308 avg_load_per_task = sg_div_cpu_power(group, 3405 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3309 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3406{
3407 return;
3408}
3409
3410static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3411 int this_cpu, unsigned long *imbalance)
3412{
3413 return 0;
3414}
3415#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3416
3417
3418/**
3419 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3420 * @group: sched_group whose statistics are to be updated.
3421 * @this_cpu: Cpu for which load balance is currently performed.
3422 * @idle: Idle status of this_cpu
3423 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3424 * @sd_idle: Idle status of the sched_domain containing group.
3425 * @local_group: Does group contain this_cpu.
3426 * @cpus: Set of cpus considered for load balancing.
3427 * @balance: Should we balance.
3428 * @sgs: variable to hold the statistics for this group.
3429 */
3430static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3431 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3432 int local_group, const struct cpumask *cpus,
3433 int *balance, struct sg_lb_stats *sgs)
3434{
3435 unsigned long load, max_cpu_load, min_cpu_load;
3436 int i;
3437 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3438 unsigned long sum_avg_load_per_task;
3439 unsigned long avg_load_per_task;
3310 3440
3311 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3441 if (local_group)
3312 __group_imb = 1; 3442 balance_cpu = group_first_cpu(group);
3313 3443
3314 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3444 /* Tally up the load of all CPUs in the group */
3445 sum_avg_load_per_task = avg_load_per_task = 0;
3446 max_cpu_load = 0;
3447 min_cpu_load = ~0UL;
3315 3448
3449 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3450 struct rq *rq = cpu_rq(i);
3451
3452 if (*sd_idle && rq->nr_running)
3453 *sd_idle = 0;
3454
3455 /* Bias balancing toward cpus of our domain */
3316 if (local_group) { 3456 if (local_group) {
3317 this_load = avg_load; 3457 if (idle_cpu(i) && !first_idle_cpu) {
3318 this = group; 3458 first_idle_cpu = 1;
3319 this_nr_running = sum_nr_running; 3459 balance_cpu = i;
3320 this_load_per_task = sum_weighted_load; 3460 }
3321 } else if (avg_load > max_load && 3461
3322 (sum_nr_running > group_capacity || __group_imb)) { 3462 load = target_load(i, load_idx);
3323 max_load = avg_load; 3463 } else {
3324 busiest = group; 3464 load = source_load(i, load_idx);
3325 busiest_nr_running = sum_nr_running; 3465 if (load > max_cpu_load)
3326 busiest_load_per_task = sum_weighted_load; 3466 max_cpu_load = load;
3327 group_imb = __group_imb; 3467 if (min_cpu_load > load)
3468 min_cpu_load = load;
3328 } 3469 }
3329 3470
3330#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3471 sgs->group_load += load;
3331 /* 3472 sgs->sum_nr_running += rq->nr_running;
3332 * Busy processors will not participate in power savings 3473 sgs->sum_weighted_load += weighted_cpuload(i);
3333 * balance.
3334 */
3335 if (idle == CPU_NOT_IDLE ||
3336 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3337 goto group_next;
3338 3474
3339 /* 3475 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3340 * If the local group is idle or completely loaded 3476 }
3341 * no need to do power savings balance at this domain
3342 */
3343 if (local_group && (this_nr_running >= group_capacity ||
3344 !this_nr_running))
3345 power_savings_balance = 0;
3346 3477
3347 /* 3478 /*
3348 * If a group is already running at full capacity or idle, 3479 * First idle cpu or the first cpu(busiest) in this sched group
3349 * don't include that group in power savings calculations 3480 * is eligible for doing load balancing at this and above
3350 */ 3481 * domains. In the newly idle case, we will allow all the cpu's
3351 if (!power_savings_balance || sum_nr_running >= group_capacity 3482 * to do the newly idle load balance.
3352 || !sum_nr_running) 3483 */
3353 goto group_next; 3484 if (idle != CPU_NEWLY_IDLE && local_group &&
3485 balance_cpu != this_cpu && balance) {
3486 *balance = 0;
3487 return;
3488 }
3354 3489
3355 /* 3490 /* Adjust by relative CPU power of the group */
3356 * Calculate the group which has the least non-idle load. 3491 sgs->avg_load = sg_div_cpu_power(group,
3357 * This is the group from where we need to pick up the load 3492 sgs->group_load * SCHED_LOAD_SCALE);
3358 * for saving power
3359 */
3360 if ((sum_nr_running < min_nr_running) ||
3361 (sum_nr_running == min_nr_running &&
3362 cpumask_first(sched_group_cpus(group)) >
3363 cpumask_first(sched_group_cpus(group_min)))) {
3364 group_min = group;
3365 min_nr_running = sum_nr_running;
3366 min_load_per_task = sum_weighted_load /
3367 sum_nr_running;
3368 }
3369 3493
3370 /* 3494
3371 * Calculate the group which is almost near its 3495 /*
3372 * capacity but still has some space to pick up some load 3496 * Consider the group unbalanced when the imbalance is larger
3373 * from other group and save more power 3497 * than the average weight of two tasks.
3374 */ 3498 *
3375 if (sum_nr_running <= group_capacity - 1) { 3499 * APZ: with cgroup the avg task weight can vary wildly and
3376 if (sum_nr_running > leader_nr_running || 3500 * might not be a suitable number - should we keep a
3377 (sum_nr_running == leader_nr_running && 3501 * normalized nr_running number somewhere that negates
3378 cpumask_first(sched_group_cpus(group)) < 3502 * the hierarchy?
3379 cpumask_first(sched_group_cpus(group_leader)))) { 3503 */
3380 group_leader = group; 3504 avg_load_per_task = sg_div_cpu_power(group,
3381 leader_nr_running = sum_nr_running; 3505 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3382 } 3506
3507 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3508 sgs->group_imb = 1;
3509
3510 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3511
3512}
3513
3514/**
 3515 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3516 * @sd: sched_domain whose statistics are to be updated.
3517 * @this_cpu: Cpu for which load balance is currently performed.
3518 * @idle: Idle status of this_cpu
3519 * @sd_idle: Idle status of the sched_domain containing group.
3520 * @cpus: Set of cpus considered for load balancing.
3521 * @balance: Should we balance.
3522 * @sds: variable to hold the statistics for this sched_domain.
3523 */
3524static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3525 enum cpu_idle_type idle, int *sd_idle,
3526 const struct cpumask *cpus, int *balance,
3527 struct sd_lb_stats *sds)
3528{
3529 struct sched_group *group = sd->groups;
3530 struct sg_lb_stats sgs;
3531 int load_idx;
3532
3533 init_sd_power_savings_stats(sd, sds, idle);
3534 load_idx = get_sd_load_idx(sd, idle);
3535
3536 do {
3537 int local_group;
3538
3539 local_group = cpumask_test_cpu(this_cpu,
3540 sched_group_cpus(group));
3541 memset(&sgs, 0, sizeof(sgs));
3542 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3543 local_group, cpus, balance, &sgs);
3544
3545 if (local_group && balance && !(*balance))
3546 return;
3547
3548 sds->total_load += sgs.group_load;
3549 sds->total_pwr += group->__cpu_power;
3550
3551 if (local_group) {
3552 sds->this_load = sgs.avg_load;
3553 sds->this = group;
3554 sds->this_nr_running = sgs.sum_nr_running;
3555 sds->this_load_per_task = sgs.sum_weighted_load;
3556 } else if (sgs.avg_load > sds->max_load &&
3557 (sgs.sum_nr_running > sgs.group_capacity ||
3558 sgs.group_imb)) {
3559 sds->max_load = sgs.avg_load;
3560 sds->busiest = group;
3561 sds->busiest_nr_running = sgs.sum_nr_running;
3562 sds->busiest_load_per_task = sgs.sum_weighted_load;
3563 sds->group_imb = sgs.group_imb;
3383 } 3564 }
3384group_next: 3565
3385#endif 3566 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3386 group = group->next; 3567 group = group->next;
3387 } while (group != sd->groups); 3568 } while (group != sd->groups);
3388 3569
3389 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3570}
3390 goto out_balanced;
3391
3392 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3393 3571
3394 if (this_load >= avg_load || 3572/**
3395 100*max_load <= sd->imbalance_pct*this_load) 3573 * fix_small_imbalance - Calculate the minor imbalance that exists
3396 goto out_balanced; 3574 * amongst the groups of a sched_domain, during
3575 * load balancing.
3576 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3577 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3578 * @imbalance: Variable to store the imbalance.
3579 */
3580static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3581 int this_cpu, unsigned long *imbalance)
3582{
3583 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3584 unsigned int imbn = 2;
3585
3586 if (sds->this_nr_running) {
3587 sds->this_load_per_task /= sds->this_nr_running;
3588 if (sds->busiest_load_per_task >
3589 sds->this_load_per_task)
3590 imbn = 1;
3591 } else
3592 sds->this_load_per_task =
3593 cpu_avg_load_per_task(this_cpu);
3397 3594
3398 busiest_load_per_task /= busiest_nr_running; 3595 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3399 if (group_imb) 3596 sds->busiest_load_per_task * imbn) {
3400 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3597 *imbalance = sds->busiest_load_per_task;
3598 return;
3599 }
3401 3600
3402 /* 3601 /*
3403 * We're trying to get all the cpus to the average_load, so we don't 3602 * OK, we don't have enough imbalance to justify moving tasks,
3404 * want to push ourselves above the average load, nor do we wish to 3603 * however we may be able to increase total CPU power used by
3405 * reduce the max loaded cpu below the average load, as either of these 3604 * moving them.
3406 * actions would just result in more rebalancing later, and ping-pong
3407 * tasks around. Thus we look for the minimum possible imbalance.
3408 * Negative imbalances (*we* are more loaded than anyone else) will
3409 * be counted as no imbalance for these purposes -- we can't fix that
3410 * by pulling tasks to us. Be careful of negative numbers as they'll
3411 * appear as very large values with unsigned longs.
3412 */ 3605 */
3413 if (max_load <= busiest_load_per_task)
3414 goto out_balanced;
3415 3606
3607 pwr_now += sds->busiest->__cpu_power *
3608 min(sds->busiest_load_per_task, sds->max_load);
3609 pwr_now += sds->this->__cpu_power *
3610 min(sds->this_load_per_task, sds->this_load);
3611 pwr_now /= SCHED_LOAD_SCALE;
3612
3613 /* Amount of load we'd subtract */
3614 tmp = sg_div_cpu_power(sds->busiest,
3615 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3616 if (sds->max_load > tmp)
3617 pwr_move += sds->busiest->__cpu_power *
3618 min(sds->busiest_load_per_task, sds->max_load - tmp);
3619
3620 /* Amount of load we'd add */
3621 if (sds->max_load * sds->busiest->__cpu_power <
3622 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3623 tmp = sg_div_cpu_power(sds->this,
3624 sds->max_load * sds->busiest->__cpu_power);
3625 else
3626 tmp = sg_div_cpu_power(sds->this,
3627 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3628 pwr_move += sds->this->__cpu_power *
3629 min(sds->this_load_per_task, sds->this_load + tmp);
3630 pwr_move /= SCHED_LOAD_SCALE;
3631
3632 /* Move if we gain throughput */
3633 if (pwr_move > pwr_now)
3634 *imbalance = sds->busiest_load_per_task;
3635}
3636
3637/**
3638 * calculate_imbalance - Calculate the amount of imbalance present within the
3639 * groups of a given sched_domain during load balance.
3640 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3641 * @this_cpu: Cpu for which currently load balance is being performed.
3642 * @imbalance: The variable to store the imbalance.
3643 */
3644static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3645 unsigned long *imbalance)
3646{
3647 unsigned long max_pull;
3416 /* 3648 /*
3417 * In the presence of smp nice balancing, certain scenarios can have 3649 * In the presence of smp nice balancing, certain scenarios can have
3418 * max load less than avg load(as we skip the groups at or below 3650 * max load less than avg load(as we skip the groups at or below
3419 * its cpu_power, while calculating max_load..) 3651 * its cpu_power, while calculating max_load..)
3420 */ 3652 */
3421 if (max_load < avg_load) { 3653 if (sds->max_load < sds->avg_load) {
3422 *imbalance = 0; 3654 *imbalance = 0;
3423 goto small_imbalance; 3655 return fix_small_imbalance(sds, this_cpu, imbalance);
3424 } 3656 }
3425 3657
3426 /* Don't want to pull so many tasks that a group would go idle */ 3658 /* Don't want to pull so many tasks that a group would go idle */
3427 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3659 max_pull = min(sds->max_load - sds->avg_load,
3660 sds->max_load - sds->busiest_load_per_task);
3428 3661
3429 /* How much load to actually move to equalise the imbalance */ 3662 /* How much load to actually move to equalise the imbalance */
3430 *imbalance = min(max_pull * busiest->__cpu_power, 3663 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3431 (avg_load - this_load) * this->__cpu_power) 3664 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3432 / SCHED_LOAD_SCALE; 3665 / SCHED_LOAD_SCALE;
3433 3666
3434 /* 3667 /*
@@ -3437,78 +3670,110 @@ group_next:
3437 * a think about bumping its value to force at least one task to be 3670 * a think about bumping its value to force at least one task to be
3438 * moved 3671 * moved
3439 */ 3672 */
3440 if (*imbalance < busiest_load_per_task) { 3673 if (*imbalance < sds->busiest_load_per_task)
3441 unsigned long tmp, pwr_now, pwr_move; 3674 return fix_small_imbalance(sds, this_cpu, imbalance);
3442 unsigned int imbn;
3443
3444small_imbalance:
3445 pwr_move = pwr_now = 0;
3446 imbn = 2;
3447 if (this_nr_running) {
3448 this_load_per_task /= this_nr_running;
3449 if (busiest_load_per_task > this_load_per_task)
3450 imbn = 1;
3451 } else
3452 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3453 3675
3454 if (max_load - this_load + busiest_load_per_task >= 3676}
3455 busiest_load_per_task * imbn) { 3677/******* find_busiest_group() helpers end here *********************/
3456 *imbalance = busiest_load_per_task;
3457 return busiest;
3458 }
3459 3678
3460 /* 3679/**
3461 * OK, we don't have enough imbalance to justify moving tasks, 3680 * find_busiest_group - Returns the busiest group within the sched_domain
3462 * however we may be able to increase total CPU power used by 3681 * if there is an imbalance. If there isn't an imbalance, and
3463 * moving them. 3682 * the user has opted for power-savings, it returns a group whose
3464 */ 3683 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3684 * such a group exists.
3685 *
3686 * Also calculates the amount of weighted load which should be moved
3687 * to restore balance.
3688 *
3689 * @sd: The sched_domain whose busiest group is to be returned.
3690 * @this_cpu: The cpu for which load balancing is currently being performed.
3691 * @imbalance: Variable which stores amount of weighted load which should
3692 * be moved to restore balance/put a group to idle.
3693 * @idle: The idle status of this_cpu.
3694 * @sd_idle: The idleness of sd
3695 * @cpus: The set of CPUs under consideration for load-balancing.
3696 * @balance: Pointer to a variable indicating if this_cpu
3697 * is the appropriate cpu to perform load balancing at this_level.
3698 *
3699 * Returns: - the busiest group if imbalance exists.
3700 * - If no imbalance and user has opted for power-savings balance,
3701 * return the least loaded group whose CPUs can be
3702 * put to idle by rebalancing its tasks onto our group.
3703 */
3704static struct sched_group *
3705find_busiest_group(struct sched_domain *sd, int this_cpu,
3706 unsigned long *imbalance, enum cpu_idle_type idle,
3707 int *sd_idle, const struct cpumask *cpus, int *balance)
3708{
3709 struct sd_lb_stats sds;
3465 3710
3466 pwr_now += busiest->__cpu_power * 3711 memset(&sds, 0, sizeof(sds));
3467 min(busiest_load_per_task, max_load);
3468 pwr_now += this->__cpu_power *
3469 min(this_load_per_task, this_load);
3470 pwr_now /= SCHED_LOAD_SCALE;
3471
3472 /* Amount of load we'd subtract */
3473 tmp = sg_div_cpu_power(busiest,
3474 busiest_load_per_task * SCHED_LOAD_SCALE);
3475 if (max_load > tmp)
3476 pwr_move += busiest->__cpu_power *
3477 min(busiest_load_per_task, max_load - tmp);
3478
3479 /* Amount of load we'd add */
3480 if (max_load * busiest->__cpu_power <
3481 busiest_load_per_task * SCHED_LOAD_SCALE)
3482 tmp = sg_div_cpu_power(this,
3483 max_load * busiest->__cpu_power);
3484 else
3485 tmp = sg_div_cpu_power(this,
3486 busiest_load_per_task * SCHED_LOAD_SCALE);
3487 pwr_move += this->__cpu_power *
3488 min(this_load_per_task, this_load + tmp);
3489 pwr_move /= SCHED_LOAD_SCALE;
3490 3712
3491 /* Move if we gain throughput */ 3713 /*
 3492 if (pwr_move > pwr_now) 3714 * Compute the various statistics relevant for load balancing at
3493 *imbalance = busiest_load_per_task; 3715 * this level.
3494 } 3716 */
3717 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3718 balance, &sds);
3719
3720 /* Cases where imbalance does not exist from POV of this_cpu */
3721 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3722 * at this level.
3723 * 2) There is no busy sibling group to pull from.
3724 * 3) This group is the busiest group.
 3725 * 4) This group is busier than the average busyness at this
3726 * sched_domain.
3727 * 5) The imbalance is within the specified limit.
3728 * 6) Any rebalance would lead to ping-pong
3729 */
3730 if (balance && !(*balance))
3731 goto ret;
3495 3732
3496 return busiest; 3733 if (!sds.busiest || sds.busiest_nr_running == 0)
3734 goto out_balanced;
3497 3735
3498out_balanced: 3736 if (sds.this_load >= sds.max_load)
3499#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3737 goto out_balanced;
3500 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3501 goto ret;
3502 3738
3503 if (this == group_leader && group_leader != group_min) { 3739 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3504 *imbalance = min_load_per_task; 3740
3505 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3741 if (sds.this_load >= sds.avg_load)
3506 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3742 goto out_balanced;
3507 cpumask_first(sched_group_cpus(group_leader)); 3743
3508 } 3744 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3509 return group_min; 3745 goto out_balanced;
3510 } 3746
3511#endif 3747 sds.busiest_load_per_task /= sds.busiest_nr_running;
3748 if (sds.group_imb)
3749 sds.busiest_load_per_task =
3750 min(sds.busiest_load_per_task, sds.avg_load);
3751
3752 /*
3753 * We're trying to get all the cpus to the average_load, so we don't
3754 * want to push ourselves above the average load, nor do we wish to
3755 * reduce the max loaded cpu below the average load, as either of these
3756 * actions would just result in more rebalancing later, and ping-pong
3757 * tasks around. Thus we look for the minimum possible imbalance.
3758 * Negative imbalances (*we* are more loaded than anyone else) will
3759 * be counted as no imbalance for these purposes -- we can't fix that
3760 * by pulling tasks to us. Be careful of negative numbers as they'll
3761 * appear as very large values with unsigned longs.
3762 */
3763 if (sds.max_load <= sds.busiest_load_per_task)
3764 goto out_balanced;
3765
3766 /* Looks like there is an imbalance. Compute it */
3767 calculate_imbalance(&sds, this_cpu, imbalance);
3768 return sds.busiest;
3769
3770out_balanced:
3771 /*
3772 * There is no obvious imbalance. But check if we can do some balancing
3773 * to save power.
3774 */
3775 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3776 return sds.busiest;
3512ret: 3777ret:
3513 *imbalance = 0; 3778 *imbalance = 0;
3514 return NULL; 3779 return NULL;
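A closing note on the arithmetic that the diff moves into calculate_imbalance(): the amount of weighted load to migrate is the smaller of what can be pulled from the busiest group without dragging it below the domain average and what can be pushed onto the local group without lifting it above that average, each scaled by the group's __cpu_power and normalized by SCHED_LOAD_SCALE. The standalone sketch below only illustrates that min()-of-two-rooms structure with made-up numbers; toy_imbalance() is hypothetical, and the kernel performs the division through sg_div_cpu_power() and handles the max_load < avg_load and small-imbalance cases via fix_small_imbalance() as shown above.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* fixed-point load unit, as in the kernel */

/*
 * Illustrative only: mirrors the min()/min() shape of calculate_imbalance().
 * All parameters are made-up sample values, not kernel data.
 */
static unsigned long toy_imbalance(unsigned long max_load, unsigned long avg_load,
				   unsigned long this_load,
				   unsigned long busiest_load_per_task,
				   unsigned long busiest_power,
				   unsigned long this_power)
{
	unsigned long max_pull, pull_room, push_room;

	/* Don't pull so many tasks that the busiest group would go idle. */
	max_pull = max_load - avg_load < max_load - busiest_load_per_task ?
		   max_load - avg_load : max_load - busiest_load_per_task;

	/* Excess we may take from busiest vs. room left on the local group. */
	pull_room = max_pull * busiest_power;
	push_room = (avg_load - this_load) * this_power;

	return (pull_room < push_room ? pull_room : push_room) / SCHED_LOAD_SCALE;
}

int main(void)
{
	/* Busiest group at 2048, local group at 512, domain average 1280. */
	printf("imbalance = %lu\n",
	       toy_imbalance(2048, 1280, 512, 512, 1024, 1024));
	return 0;
}

With these sample values the sketch prints imbalance = 768, i.e. exactly the excess of the busiest group over the domain average.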