diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-03-26 10:25:24 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-03-26 10:25:24 -0400 |
commit | 66fef08f7d5267b2312c4b48a6d2957d2d414105 (patch) | |
tree | ee6fb1ce3354c8a06c7c1b08b2c386aaebf07a33 | |
parent | b6d9842258d1ba27fb978cded74eb4b6aa15edc8 (diff) | |
parent | b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 (diff) |
Merge branch 'sched/balancing' into sched/core
-rw-r--r-- | kernel/sched.c | 765 |
1 files changed, 515 insertions, 250 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 7b389c74f8ff..9f8506d68fdc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3189 | 3189 | ||
3190 | return 0; | 3190 | return 0; |
3191 | } | 3191 | } |
3192 | /********** Helpers for find_busiest_group ************************/ | ||
3193 | /** | ||
3194 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
3195 | * during load balancing. | ||
3196 | */ | ||
3197 | struct sd_lb_stats { | ||
3198 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
3199 | struct sched_group *this; /* Local group in this sd */ | ||
3200 | unsigned long total_load; /* Total load of all groups in sd */ | ||
3201 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
3202 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
3203 | |||
3204 | /** Statistics of this group */ | ||
3205 | unsigned long this_load; | ||
3206 | unsigned long this_load_per_task; | ||
3207 | unsigned long this_nr_running; | ||
3208 | |||
3209 | /* Statistics of the busiest group */ | ||
3210 | unsigned long max_load; | ||
3211 | unsigned long busiest_load_per_task; | ||
3212 | unsigned long busiest_nr_running; | ||
3213 | |||
3214 | int group_imb; /* Is there imbalance in this sd */ | ||
3215 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3216 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3217 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3218 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3219 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3220 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3221 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3222 | #endif | ||
3223 | }; | ||
3192 | 3224 | ||
3193 | /* | 3225 | /** |
3194 | * find_busiest_group finds and returns the busiest CPU group within the | 3226 | * sg_lb_stats - stats of a sched_group required for load_balancing |
3195 | * domain. It calculates and returns the amount of weighted load which | 3227 | */ |
3196 | * should be moved to restore balance via the imbalance parameter. | 3228 | struct sg_lb_stats { |
3229 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | ||
3230 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
3231 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
3232 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
3233 | unsigned long group_capacity; | ||
3234 | int group_imb; /* Is there an imbalance in the group ? */ | ||
3235 | }; | ||
3236 | |||
3237 | /** | ||
3238 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
3239 | * @group: The group whose first cpu is to be returned. | ||
3197 | */ | 3240 | */ |
3198 | static struct sched_group * | 3241 | static inline unsigned int group_first_cpu(struct sched_group *group) |
3199 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
3200 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
3201 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
3202 | { | 3242 | { |
3203 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 3243 | return cpumask_first(sched_group_cpus(group)); |
3204 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 3244 | } |
3205 | unsigned long max_pull; | ||
3206 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
3207 | unsigned long this_load_per_task, this_nr_running; | ||
3208 | int load_idx, group_imb = 0; | ||
3209 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3210 | int power_savings_balance = 1; | ||
3211 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
3212 | unsigned long min_nr_running = ULONG_MAX; | ||
3213 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
3214 | #endif | ||
3215 | 3245 | ||
3216 | max_load = this_load = total_load = total_pwr = 0; | 3246 | /** |
3217 | busiest_load_per_task = busiest_nr_running = 0; | 3247 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
3218 | this_load_per_task = this_nr_running = 0; | 3248 | * @sd: The sched_domain whose load_idx is to be obtained. |
3249 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | ||
3250 | */ | ||
3251 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
3252 | enum cpu_idle_type idle) | ||
3253 | { | ||
3254 | int load_idx; | ||
3219 | 3255 | ||
3220 | if (idle == CPU_NOT_IDLE) | 3256 | switch (idle) { |
3257 | case CPU_NOT_IDLE: | ||
3221 | load_idx = sd->busy_idx; | 3258 | load_idx = sd->busy_idx; |
3222 | else if (idle == CPU_NEWLY_IDLE) | 3259 | break; |
3260 | |||
3261 | case CPU_NEWLY_IDLE: | ||
3223 | load_idx = sd->newidle_idx; | 3262 | load_idx = sd->newidle_idx; |
3224 | else | 3263 | break; |
3264 | default: | ||
3225 | load_idx = sd->idle_idx; | 3265 | load_idx = sd->idle_idx; |
3266 | break; | ||
3267 | } | ||
3226 | 3268 | ||
3227 | do { | 3269 | return load_idx; |
3228 | unsigned long load, group_capacity, max_cpu_load, min_cpu_load; | 3270 | } |
3229 | int local_group; | ||
3230 | int i; | ||
3231 | int __group_imb = 0; | ||
3232 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
3233 | unsigned long sum_nr_running, sum_weighted_load; | ||
3234 | unsigned long sum_avg_load_per_task; | ||
3235 | unsigned long avg_load_per_task; | ||
3236 | 3271 | ||
3237 | local_group = cpumask_test_cpu(this_cpu, | ||
3238 | sched_group_cpus(group)); | ||
3239 | 3272 | ||
3240 | if (local_group) | 3273 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
3241 | balance_cpu = cpumask_first(sched_group_cpus(group)); | 3274 | /** |
3275 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3276 | * the given sched_domain, during load balancing. | ||
3277 | * | ||
3278 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3279 | * @sds: Variable containing the statistics for sd. | ||
3280 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3281 | */ | ||
3282 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3283 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3284 | { | ||
3285 | /* | ||
3286 | * Busy processors will not participate in power savings | ||
3287 | * balance. | ||
3288 | */ | ||
3289 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3290 | sds->power_savings_balance = 0; | ||
3291 | else { | ||
3292 | sds->power_savings_balance = 1; | ||
3293 | sds->min_nr_running = ULONG_MAX; | ||
3294 | sds->leader_nr_running = 0; | ||
3295 | } | ||
3296 | } | ||
3242 | 3297 | ||
3243 | /* Tally up the load of all CPUs in the group */ | 3298 | /** |
3244 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3299 | * update_sd_power_savings_stats - Update the power saving stats for a |
3245 | sum_avg_load_per_task = avg_load_per_task = 0; | 3300 | * sched_domain while performing load balancing. |
3301 | * | ||
3302 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3303 | * @sds: Variable containing the statistics of the sched_domain | ||
3304 | * @local_group: Does group contain the CPU for which we're performing | ||
3305 | * load balancing ? | ||
3306 | * @sgs: Variable containing the statistics of the group. | ||
3307 | */ | ||
3308 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3309 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3310 | { | ||
3246 | 3311 | ||
3247 | max_cpu_load = 0; | 3312 | if (!sds->power_savings_balance) |
3248 | min_cpu_load = ~0UL; | 3313 | return; |
3249 | 3314 | ||
3250 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3315 | /* |
3251 | struct rq *rq = cpu_rq(i); | 3316 | * If the local group is idle or completely loaded |
3317 | * no need to do power savings balance at this domain | ||
3318 | */ | ||
3319 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3320 | !sds->this_nr_running)) | ||
3321 | sds->power_savings_balance = 0; | ||
3252 | 3322 | ||
3253 | if (*sd_idle && rq->nr_running) | 3323 | /* |
3254 | *sd_idle = 0; | 3324 | * If a group is already running at full capacity or idle, |
3325 | * don't include that group in power savings calculations | ||
3326 | */ | ||
3327 | if (!sds->power_savings_balance || | ||
3328 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3329 | !sgs->sum_nr_running) | ||
3330 | return; | ||
3255 | 3331 | ||
3256 | /* Bias balancing toward cpus of our domain */ | 3332 | /* |
3257 | if (local_group) { | 3333 | * Calculate the group which has the least non-idle load. |
3258 | if (idle_cpu(i) && !first_idle_cpu) { | 3334 | * This is the group from where we need to pick up the load |
3259 | first_idle_cpu = 1; | 3335 | * for saving power |
3260 | balance_cpu = i; | 3336 | */ |
3261 | } | 3337 | if ((sgs->sum_nr_running < sds->min_nr_running) || |
3338 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3339 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3340 | sds->group_min = group; | ||
3341 | sds->min_nr_running = sgs->sum_nr_running; | ||
3342 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3343 | sgs->sum_nr_running; | ||
3344 | } | ||
3262 | 3345 | ||
3263 | load = target_load(i, load_idx); | 3346 | /* |
3264 | } else { | 3347 | * Calculate the group which is almost near its |
3265 | load = source_load(i, load_idx); | 3348 | * capacity but still has some space to pick up some load |
3266 | if (load > max_cpu_load) | 3349 | * from other group and save more power |
3267 | max_cpu_load = load; | 3350 | */ |
3268 | if (min_cpu_load > load) | 3351 | if (sgs->sum_nr_running > sgs->group_capacity - 1) |
3269 | min_cpu_load = load; | 3352 | return; |
3270 | } | ||
3271 | 3353 | ||
3272 | avg_load += load; | 3354 | if (sgs->sum_nr_running > sds->leader_nr_running || |
3273 | sum_nr_running += rq->nr_running; | 3355 | (sgs->sum_nr_running == sds->leader_nr_running && |
3274 | sum_weighted_load += weighted_cpuload(i); | 3356 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { |
3357 | sds->group_leader = group; | ||
3358 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3359 | } | ||
3360 | } | ||
3275 | 3361 | ||
3276 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | 3362 | /** |
3277 | } | 3363 | * check_power_save_busiest_group - Check if we have potential to perform |
3364 | * some power-savings balance. If yes, set the busiest group to be | ||
3365 | * the least loaded group in the sched_domain, so that it's CPUs can | ||
3366 | * be put to idle. | ||
3367 | * | ||
3368 | * @sds: Variable containing the statistics of the sched_domain | ||
3369 | * under consideration. | ||
3370 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3371 | * @imbalance: Variable to store the imbalance. | ||
3372 | * | ||
3373 | * Returns 1 if there is potential to perform power-savings balance. | ||
3374 | * Else returns 0. | ||
3375 | */ | ||
3376 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3377 | int this_cpu, unsigned long *imbalance) | ||
3378 | { | ||
3379 | if (!sds->power_savings_balance) | ||
3380 | return 0; | ||
3278 | 3381 | ||
3279 | /* | 3382 | if (sds->this != sds->group_leader || |
3280 | * First idle cpu or the first cpu(busiest) in this sched group | 3383 | sds->group_leader == sds->group_min) |
3281 | * is eligible for doing load balancing at this and above | 3384 | return 0; |
3282 | * domains. In the newly idle case, we will allow all the cpu's | ||
3283 | * to do the newly idle load balance. | ||
3284 | */ | ||
3285 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
3286 | balance_cpu != this_cpu && balance) { | ||
3287 | *balance = 0; | ||
3288 | goto ret; | ||
3289 | } | ||
3290 | 3385 | ||
3291 | total_load += avg_load; | 3386 | *imbalance = sds->min_load_per_task; |
3292 | total_pwr += group->__cpu_power; | 3387 | sds->busiest = sds->group_min; |
3293 | 3388 | ||
3294 | /* Adjust by relative CPU power of the group */ | 3389 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { |
3295 | avg_load = sg_div_cpu_power(group, | 3390 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = |
3296 | avg_load * SCHED_LOAD_SCALE); | 3391 | group_first_cpu(sds->group_leader); |
3392 | } | ||
3297 | 3393 | ||
3394 | return 1; | ||
3298 | 3395 | ||
3299 | /* | 3396 | } |
3300 | * Consider the group unbalanced when the imbalance is larger | 3397 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3301 | * than the average weight of two tasks. | 3398 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, |
3302 | * | 3399 | struct sd_lb_stats *sds, enum cpu_idle_type idle) |
3303 | * APZ: with cgroup the avg task weight can vary wildly and | 3400 | { |
3304 | * might not be a suitable number - should we keep a | 3401 | return; |
3305 | * normalized nr_running number somewhere that negates | 3402 | } |
3306 | * the hierarchy? | 3403 | |
3307 | */ | 3404 | static inline void update_sd_power_savings_stats(struct sched_group *group, |
3308 | avg_load_per_task = sg_div_cpu_power(group, | 3405 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) |
3309 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3406 | { |
3407 | return; | ||
3408 | } | ||
3409 | |||
3410 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3411 | int this_cpu, unsigned long *imbalance) | ||
3412 | { | ||
3413 | return 0; | ||
3414 | } | ||
3415 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3416 | |||
3417 | |||
3418 | /** | ||
3419 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
3420 | * @group: sched_group whose statistics are to be updated. | ||
3421 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3422 | * @idle: Idle status of this_cpu | ||
3423 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
3424 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3425 | * @local_group: Does group contain this_cpu. | ||
3426 | * @cpus: Set of cpus considered for load balancing. | ||
3427 | * @balance: Should we balance. | ||
3428 | * @sgs: variable to hold the statistics for this group. | ||
3429 | */ | ||
3430 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | ||
3431 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
3432 | int local_group, const struct cpumask *cpus, | ||
3433 | int *balance, struct sg_lb_stats *sgs) | ||
3434 | { | ||
3435 | unsigned long load, max_cpu_load, min_cpu_load; | ||
3436 | int i; | ||
3437 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
3438 | unsigned long sum_avg_load_per_task; | ||
3439 | unsigned long avg_load_per_task; | ||
3310 | 3440 | ||
3311 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3441 | if (local_group) |
3312 | __group_imb = 1; | 3442 | balance_cpu = group_first_cpu(group); |
3313 | 3443 | ||
3314 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3444 | /* Tally up the load of all CPUs in the group */ |
3445 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3446 | max_cpu_load = 0; | ||
3447 | min_cpu_load = ~0UL; | ||
3315 | 3448 | ||
3449 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
3450 | struct rq *rq = cpu_rq(i); | ||
3451 | |||
3452 | if (*sd_idle && rq->nr_running) | ||
3453 | *sd_idle = 0; | ||
3454 | |||
3455 | /* Bias balancing toward cpus of our domain */ | ||
3316 | if (local_group) { | 3456 | if (local_group) { |
3317 | this_load = avg_load; | 3457 | if (idle_cpu(i) && !first_idle_cpu) { |
3318 | this = group; | 3458 | first_idle_cpu = 1; |
3319 | this_nr_running = sum_nr_running; | 3459 | balance_cpu = i; |
3320 | this_load_per_task = sum_weighted_load; | 3460 | } |
3321 | } else if (avg_load > max_load && | 3461 | |
3322 | (sum_nr_running > group_capacity || __group_imb)) { | 3462 | load = target_load(i, load_idx); |
3323 | max_load = avg_load; | 3463 | } else { |
3324 | busiest = group; | 3464 | load = source_load(i, load_idx); |
3325 | busiest_nr_running = sum_nr_running; | 3465 | if (load > max_cpu_load) |
3326 | busiest_load_per_task = sum_weighted_load; | 3466 | max_cpu_load = load; |
3327 | group_imb = __group_imb; | 3467 | if (min_cpu_load > load) |
3468 | min_cpu_load = load; | ||
3328 | } | 3469 | } |
3329 | 3470 | ||
3330 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 3471 | sgs->group_load += load; |
3331 | /* | 3472 | sgs->sum_nr_running += rq->nr_running; |
3332 | * Busy processors will not participate in power savings | 3473 | sgs->sum_weighted_load += weighted_cpuload(i); |
3333 | * balance. | ||
3334 | */ | ||
3335 | if (idle == CPU_NOT_IDLE || | ||
3336 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3337 | goto group_next; | ||
3338 | 3474 | ||
3339 | /* | 3475 | sum_avg_load_per_task += cpu_avg_load_per_task(i); |
3340 | * If the local group is idle or completely loaded | 3476 | } |
3341 | * no need to do power savings balance at this domain | ||
3342 | */ | ||
3343 | if (local_group && (this_nr_running >= group_capacity || | ||
3344 | !this_nr_running)) | ||
3345 | power_savings_balance = 0; | ||
3346 | 3477 | ||
3347 | /* | 3478 | /* |
3348 | * If a group is already running at full capacity or idle, | 3479 | * First idle cpu or the first cpu(busiest) in this sched group |
3349 | * don't include that group in power savings calculations | 3480 | * is eligible for doing load balancing at this and above |
3350 | */ | 3481 | * domains. In the newly idle case, we will allow all the cpu's |
3351 | if (!power_savings_balance || sum_nr_running >= group_capacity | 3482 | * to do the newly idle load balance. |
3352 | || !sum_nr_running) | 3483 | */ |
3353 | goto group_next; | 3484 | if (idle != CPU_NEWLY_IDLE && local_group && |
3485 | balance_cpu != this_cpu && balance) { | ||
3486 | *balance = 0; | ||
3487 | return; | ||
3488 | } | ||
3354 | 3489 | ||
3355 | /* | 3490 | /* Adjust by relative CPU power of the group */ |
3356 | * Calculate the group which has the least non-idle load. | 3491 | sgs->avg_load = sg_div_cpu_power(group, |
3357 | * This is the group from where we need to pick up the load | 3492 | sgs->group_load * SCHED_LOAD_SCALE); |
3358 | * for saving power | ||
3359 | */ | ||
3360 | if ((sum_nr_running < min_nr_running) || | ||
3361 | (sum_nr_running == min_nr_running && | ||
3362 | cpumask_first(sched_group_cpus(group)) > | ||
3363 | cpumask_first(sched_group_cpus(group_min)))) { | ||
3364 | group_min = group; | ||
3365 | min_nr_running = sum_nr_running; | ||
3366 | min_load_per_task = sum_weighted_load / | ||
3367 | sum_nr_running; | ||
3368 | } | ||
3369 | 3493 | ||
3370 | /* | 3494 | |
3371 | * Calculate the group which is almost near its | 3495 | /* |
3372 | * capacity but still has some space to pick up some load | 3496 | * Consider the group unbalanced when the imbalance is larger |
3373 | * from other group and save more power | 3497 | * than the average weight of two tasks. |
3374 | */ | 3498 | * |
3375 | if (sum_nr_running <= group_capacity - 1) { | 3499 | * APZ: with cgroup the avg task weight can vary wildly and |
3376 | if (sum_nr_running > leader_nr_running || | 3500 | * might not be a suitable number - should we keep a |
3377 | (sum_nr_running == leader_nr_running && | 3501 | * normalized nr_running number somewhere that negates |
3378 | cpumask_first(sched_group_cpus(group)) < | 3502 | * the hierarchy? |
3379 | cpumask_first(sched_group_cpus(group_leader)))) { | 3503 | */ |
3380 | group_leader = group; | 3504 | avg_load_per_task = sg_div_cpu_power(group, |
3381 | leader_nr_running = sum_nr_running; | 3505 | sum_avg_load_per_task * SCHED_LOAD_SCALE); |
3382 | } | 3506 | |
3507 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3508 | sgs->group_imb = 1; | ||
3509 | |||
3510 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | ||
3511 | |||
3512 | } | ||
3513 | |||
3514 | /** | ||
3515 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | ||
3516 | * @sd: sched_domain whose statistics are to be updated. | ||
3517 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3518 | * @idle: Idle status of this_cpu | ||
3519 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3520 | * @cpus: Set of cpus considered for load balancing. | ||
3521 | * @balance: Should we balance. | ||
3522 | * @sds: variable to hold the statistics for this sched_domain. | ||
3523 | */ | ||
3524 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
3525 | enum cpu_idle_type idle, int *sd_idle, | ||
3526 | const struct cpumask *cpus, int *balance, | ||
3527 | struct sd_lb_stats *sds) | ||
3528 | { | ||
3529 | struct sched_group *group = sd->groups; | ||
3530 | struct sg_lb_stats sgs; | ||
3531 | int load_idx; | ||
3532 | |||
3533 | init_sd_power_savings_stats(sd, sds, idle); | ||
3534 | load_idx = get_sd_load_idx(sd, idle); | ||
3535 | |||
3536 | do { | ||
3537 | int local_group; | ||
3538 | |||
3539 | local_group = cpumask_test_cpu(this_cpu, | ||
3540 | sched_group_cpus(group)); | ||
3541 | memset(&sgs, 0, sizeof(sgs)); | ||
3542 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | ||
3543 | local_group, cpus, balance, &sgs); | ||
3544 | |||
3545 | if (local_group && balance && !(*balance)) | ||
3546 | return; | ||
3547 | |||
3548 | sds->total_load += sgs.group_load; | ||
3549 | sds->total_pwr += group->__cpu_power; | ||
3550 | |||
3551 | if (local_group) { | ||
3552 | sds->this_load = sgs.avg_load; | ||
3553 | sds->this = group; | ||
3554 | sds->this_nr_running = sgs.sum_nr_running; | ||
3555 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
3556 | } else if (sgs.avg_load > sds->max_load && | ||
3557 | (sgs.sum_nr_running > sgs.group_capacity || | ||
3558 | sgs.group_imb)) { | ||
3559 | sds->max_load = sgs.avg_load; | ||
3560 | sds->busiest = group; | ||
3561 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
3562 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
3563 | sds->group_imb = sgs.group_imb; | ||
3383 | } | 3564 | } |
3384 | group_next: | 3565 | |
3385 | #endif | 3566 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
3386 | group = group->next; | 3567 | group = group->next; |
3387 | } while (group != sd->groups); | 3568 | } while (group != sd->groups); |
3388 | 3569 | ||
3389 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) | 3570 | } |
3390 | goto out_balanced; | ||
3391 | |||
3392 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | ||
3393 | 3571 | ||
3394 | if (this_load >= avg_load || | 3572 | /** |
3395 | 100*max_load <= sd->imbalance_pct*this_load) | 3573 | * fix_small_imbalance - Calculate the minor imbalance that exists |
3396 | goto out_balanced; | 3574 | * amongst the groups of a sched_domain, during |
3575 | * load balancing. | ||
3576 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
3577 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
3578 | * @imbalance: Variable to store the imbalance. | ||
3579 | */ | ||
3580 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
3581 | int this_cpu, unsigned long *imbalance) | ||
3582 | { | ||
3583 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
3584 | unsigned int imbn = 2; | ||
3585 | |||
3586 | if (sds->this_nr_running) { | ||
3587 | sds->this_load_per_task /= sds->this_nr_running; | ||
3588 | if (sds->busiest_load_per_task > | ||
3589 | sds->this_load_per_task) | ||
3590 | imbn = 1; | ||
3591 | } else | ||
3592 | sds->this_load_per_task = | ||
3593 | cpu_avg_load_per_task(this_cpu); | ||
3397 | 3594 | ||
3398 | busiest_load_per_task /= busiest_nr_running; | 3595 | if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= |
3399 | if (group_imb) | 3596 | sds->busiest_load_per_task * imbn) { |
3400 | busiest_load_per_task = min(busiest_load_per_task, avg_load); | 3597 | *imbalance = sds->busiest_load_per_task; |
3598 | return; | ||
3599 | } | ||
3401 | 3600 | ||
3402 | /* | 3601 | /* |
3403 | * We're trying to get all the cpus to the average_load, so we don't | 3602 | * OK, we don't have enough imbalance to justify moving tasks, |
3404 | * want to push ourselves above the average load, nor do we wish to | 3603 | * however we may be able to increase total CPU power used by |
3405 | * reduce the max loaded cpu below the average load, as either of these | 3604 | * moving them. |
3406 | * actions would just result in more rebalancing later, and ping-pong | ||
3407 | * tasks around. Thus we look for the minimum possible imbalance. | ||
3408 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
3409 | * be counted as no imbalance for these purposes -- we can't fix that | ||
3410 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
3411 | * appear as very large values with unsigned longs. | ||
3412 | */ | 3605 | */ |
3413 | if (max_load <= busiest_load_per_task) | ||
3414 | goto out_balanced; | ||
3415 | 3606 | ||
3607 | pwr_now += sds->busiest->__cpu_power * | ||
3608 | min(sds->busiest_load_per_task, sds->max_load); | ||
3609 | pwr_now += sds->this->__cpu_power * | ||
3610 | min(sds->this_load_per_task, sds->this_load); | ||
3611 | pwr_now /= SCHED_LOAD_SCALE; | ||
3612 | |||
3613 | /* Amount of load we'd subtract */ | ||
3614 | tmp = sg_div_cpu_power(sds->busiest, | ||
3615 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | ||
3616 | if (sds->max_load > tmp) | ||
3617 | pwr_move += sds->busiest->__cpu_power * | ||
3618 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
3619 | |||
3620 | /* Amount of load we'd add */ | ||
3621 | if (sds->max_load * sds->busiest->__cpu_power < | ||
3622 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
3623 | tmp = sg_div_cpu_power(sds->this, | ||
3624 | sds->max_load * sds->busiest->__cpu_power); | ||
3625 | else | ||
3626 | tmp = sg_div_cpu_power(sds->this, | ||
3627 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | ||
3628 | pwr_move += sds->this->__cpu_power * | ||
3629 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
3630 | pwr_move /= SCHED_LOAD_SCALE; | ||
3631 | |||
3632 | /* Move if we gain throughput */ | ||
3633 | if (pwr_move > pwr_now) | ||
3634 | *imbalance = sds->busiest_load_per_task; | ||
3635 | } | ||
3636 | |||
3637 | /** | ||
3638 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
3639 | * groups of a given sched_domain during load balance. | ||
3640 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
3641 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
3642 | * @imbalance: The variable to store the imbalance. | ||
3643 | */ | ||
3644 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
3645 | unsigned long *imbalance) | ||
3646 | { | ||
3647 | unsigned long max_pull; | ||
3416 | /* | 3648 | /* |
3417 | * In the presence of smp nice balancing, certain scenarios can have | 3649 | * In the presence of smp nice balancing, certain scenarios can have |
3418 | * max load less than avg load(as we skip the groups at or below | 3650 | * max load less than avg load(as we skip the groups at or below |
3419 | * its cpu_power, while calculating max_load..) | 3651 | * its cpu_power, while calculating max_load..) |
3420 | */ | 3652 | */ |
3421 | if (max_load < avg_load) { | 3653 | if (sds->max_load < sds->avg_load) { |
3422 | *imbalance = 0; | 3654 | *imbalance = 0; |
3423 | goto small_imbalance; | 3655 | return fix_small_imbalance(sds, this_cpu, imbalance); |
3424 | } | 3656 | } |
3425 | 3657 | ||
3426 | /* Don't want to pull so many tasks that a group would go idle */ | 3658 | /* Don't want to pull so many tasks that a group would go idle */ |
3427 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); | 3659 | max_pull = min(sds->max_load - sds->avg_load, |
3660 | sds->max_load - sds->busiest_load_per_task); | ||
3428 | 3661 | ||
3429 | /* How much load to actually move to equalise the imbalance */ | 3662 | /* How much load to actually move to equalise the imbalance */ |
3430 | *imbalance = min(max_pull * busiest->__cpu_power, | 3663 | *imbalance = min(max_pull * sds->busiest->__cpu_power, |
3431 | (avg_load - this_load) * this->__cpu_power) | 3664 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) |
3432 | / SCHED_LOAD_SCALE; | 3665 | / SCHED_LOAD_SCALE; |
3433 | 3666 | ||
3434 | /* | 3667 | /* |
@@ -3437,78 +3670,110 @@ group_next: | |||
3437 | * a think about bumping its value to force at least one task to be | 3670 | * a think about bumping its value to force at least one task to be |
3438 | * moved | 3671 | * moved |
3439 | */ | 3672 | */ |
3440 | if (*imbalance < busiest_load_per_task) { | 3673 | if (*imbalance < sds->busiest_load_per_task) |
3441 | unsigned long tmp, pwr_now, pwr_move; | 3674 | return fix_small_imbalance(sds, this_cpu, imbalance); |
3442 | unsigned int imbn; | ||
3443 | |||
3444 | small_imbalance: | ||
3445 | pwr_move = pwr_now = 0; | ||
3446 | imbn = 2; | ||
3447 | if (this_nr_running) { | ||
3448 | this_load_per_task /= this_nr_running; | ||
3449 | if (busiest_load_per_task > this_load_per_task) | ||
3450 | imbn = 1; | ||
3451 | } else | ||
3452 | this_load_per_task = cpu_avg_load_per_task(this_cpu); | ||
3453 | 3675 | ||
3454 | if (max_load - this_load + busiest_load_per_task >= | 3676 | } |
3455 | busiest_load_per_task * imbn) { | 3677 | /******* find_busiest_group() helpers end here *********************/ |
3456 | *imbalance = busiest_load_per_task; | ||
3457 | return busiest; | ||
3458 | } | ||
3459 | 3678 | ||
3460 | /* | 3679 | /** |
3461 | * OK, we don't have enough imbalance to justify moving tasks, | 3680 | * find_busiest_group - Returns the busiest group within the sched_domain |
3462 | * however we may be able to increase total CPU power used by | 3681 | * if there is an imbalance. If there isn't an imbalance, and |
3463 | * moving them. | 3682 | * the user has opted for power-savings, it returns a group whose |
3464 | */ | 3683 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if |
3684 | * such a group exists. | ||
3685 | * | ||
3686 | * Also calculates the amount of weighted load which should be moved | ||
3687 | * to restore balance. | ||
3688 | * | ||
3689 | * @sd: The sched_domain whose busiest group is to be returned. | ||
3690 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
3691 | * @imbalance: Variable which stores amount of weighted load which should | ||
3692 | * be moved to restore balance/put a group to idle. | ||
3693 | * @idle: The idle status of this_cpu. | ||
3694 | * @sd_idle: The idleness of sd | ||
3695 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
3696 | * @balance: Pointer to a variable indicating if this_cpu | ||
3697 | * is the appropriate cpu to perform load balancing at this_level. | ||
3698 | * | ||
3699 | * Returns: - the busiest group if imbalance exists. | ||
3700 | * - If no imbalance and user has opted for power-savings balance, | ||
3701 | * return the least loaded group whose CPUs can be | ||
3702 | * put to idle by rebalancing its tasks onto our group. | ||
3703 | */ | ||
3704 | static struct sched_group * | ||
3705 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
3706 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
3707 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
3708 | { | ||
3709 | struct sd_lb_stats sds; | ||
3465 | 3710 | ||
3466 | pwr_now += busiest->__cpu_power * | 3711 | memset(&sds, 0, sizeof(sds)); |
3467 | min(busiest_load_per_task, max_load); | ||
3468 | pwr_now += this->__cpu_power * | ||
3469 | min(this_load_per_task, this_load); | ||
3470 | pwr_now /= SCHED_LOAD_SCALE; | ||
3471 | |||
3472 | /* Amount of load we'd subtract */ | ||
3473 | tmp = sg_div_cpu_power(busiest, | ||
3474 | busiest_load_per_task * SCHED_LOAD_SCALE); | ||
3475 | if (max_load > tmp) | ||
3476 | pwr_move += busiest->__cpu_power * | ||
3477 | min(busiest_load_per_task, max_load - tmp); | ||
3478 | |||
3479 | /* Amount of load we'd add */ | ||
3480 | if (max_load * busiest->__cpu_power < | ||
3481 | busiest_load_per_task * SCHED_LOAD_SCALE) | ||
3482 | tmp = sg_div_cpu_power(this, | ||
3483 | max_load * busiest->__cpu_power); | ||
3484 | else | ||
3485 | tmp = sg_div_cpu_power(this, | ||
3486 | busiest_load_per_task * SCHED_LOAD_SCALE); | ||
3487 | pwr_move += this->__cpu_power * | ||
3488 | min(this_load_per_task, this_load + tmp); | ||
3489 | pwr_move /= SCHED_LOAD_SCALE; | ||
3490 | 3712 | ||
3491 | /* Move if we gain throughput */ | 3713 | /* |
3492 | if (pwr_move > pwr_now) | 3714 | * Compute the various statistics relavent for load balancing at |
3493 | *imbalance = busiest_load_per_task; | 3715 | * this level. |
3494 | } | 3716 | */ |
3717 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
3718 | balance, &sds); | ||
3719 | |||
3720 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
3721 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
3722 | * at this level. | ||
3723 | * 2) There is no busy sibling group to pull from. | ||
3724 | * 3) This group is the busiest group. | ||
3725 | * 4) This group is more busy than the avg busieness at this | ||
3726 | * sched_domain. | ||
3727 | * 5) The imbalance is within the specified limit. | ||
3728 | * 6) Any rebalance would lead to ping-pong | ||
3729 | */ | ||
3730 | if (balance && !(*balance)) | ||
3731 | goto ret; | ||
3495 | 3732 | ||
3496 | return busiest; | 3733 | if (!sds.busiest || sds.busiest_nr_running == 0) |
3734 | goto out_balanced; | ||
3497 | 3735 | ||
3498 | out_balanced: | 3736 | if (sds.this_load >= sds.max_load) |
3499 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 3737 | goto out_balanced; |
3500 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3501 | goto ret; | ||
3502 | 3738 | ||
3503 | if (this == group_leader && group_leader != group_min) { | 3739 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; |
3504 | *imbalance = min_load_per_task; | 3740 | |
3505 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | 3741 | if (sds.this_load >= sds.avg_load) |
3506 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | 3742 | goto out_balanced; |
3507 | cpumask_first(sched_group_cpus(group_leader)); | 3743 | |
3508 | } | 3744 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) |
3509 | return group_min; | 3745 | goto out_balanced; |
3510 | } | 3746 | |
3511 | #endif | 3747 | sds.busiest_load_per_task /= sds.busiest_nr_running; |
3748 | if (sds.group_imb) | ||
3749 | sds.busiest_load_per_task = | ||
3750 | min(sds.busiest_load_per_task, sds.avg_load); | ||
3751 | |||
3752 | /* | ||
3753 | * We're trying to get all the cpus to the average_load, so we don't | ||
3754 | * want to push ourselves above the average load, nor do we wish to | ||
3755 | * reduce the max loaded cpu below the average load, as either of these | ||
3756 | * actions would just result in more rebalancing later, and ping-pong | ||
3757 | * tasks around. Thus we look for the minimum possible imbalance. | ||
3758 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
3759 | * be counted as no imbalance for these purposes -- we can't fix that | ||
3760 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
3761 | * appear as very large values with unsigned longs. | ||
3762 | */ | ||
3763 | if (sds.max_load <= sds.busiest_load_per_task) | ||
3764 | goto out_balanced; | ||
3765 | |||
3766 | /* Looks like there is an imbalance. Compute it */ | ||
3767 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
3768 | return sds.busiest; | ||
3769 | |||
3770 | out_balanced: | ||
3771 | /* | ||
3772 | * There is no obvious imbalance. But check if we can do some balancing | ||
3773 | * to save power. | ||
3774 | */ | ||
3775 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
3776 | return sds.busiest; | ||
3512 | ret: | 3777 | ret: |
3513 | *imbalance = 0; | 3778 | *imbalance = 0; |
3514 | return NULL; | 3779 | return NULL; |