| author | Ingo Molnar <mingo@elte.hu> | 2009-03-26 10:25:24 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2009-03-26 10:25:24 -0400 |
| commit | 66fef08f7d5267b2312c4b48a6d2957d2d414105 (patch) | |
| tree | ee6fb1ce3354c8a06c7c1b08b2c386aaebf07a33 | |
| parent | b6d9842258d1ba27fb978cded74eb4b6aa15edc8 (diff) | |
| parent | b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 (diff) | |
Merge branch 'sched/balancing' into sched/core
| -rw-r--r-- | kernel/sched.c | 765 |
1 file changed, 515 insertions(+), 250 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 7b389c74f8ff..9f8506d68fdc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3189,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
| 3189 | 3189 | ||
| 3190 | return 0; | 3190 | return 0; |
| 3191 | } | 3191 | } |
| 3192 | /********** Helpers for find_busiest_group ************************/ | ||
| 3193 | /** | ||
| 3194 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
| 3195 | * during load balancing. | ||
| 3196 | */ | ||
| 3197 | struct sd_lb_stats { | ||
| 3198 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
| 3199 | struct sched_group *this; /* Local group in this sd */ | ||
| 3200 | unsigned long total_load; /* Total load of all groups in sd */ | ||
| 3201 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
| 3202 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
| 3203 | |||
| 3204 | /** Statistics of this group */ | ||
| 3205 | unsigned long this_load; | ||
| 3206 | unsigned long this_load_per_task; | ||
| 3207 | unsigned long this_nr_running; | ||
| 3208 | |||
| 3209 | /* Statistics of the busiest group */ | ||
| 3210 | unsigned long max_load; | ||
| 3211 | unsigned long busiest_load_per_task; | ||
| 3212 | unsigned long busiest_nr_running; | ||
| 3213 | |||
| 3214 | int group_imb; /* Is there imbalance in this sd */ | ||
| 3215 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 3216 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
| 3217 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
| 3218 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
| 3219 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
| 3220 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
| 3221 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
| 3222 | #endif | ||
| 3223 | }; | ||
| 3192 | 3224 | ||
| 3193 | /* | 3225 | /** |
| 3194 | * find_busiest_group finds and returns the busiest CPU group within the | 3226 | * sg_lb_stats - stats of a sched_group required for load_balancing |
| 3195 | * domain. It calculates and returns the amount of weighted load which | 3227 | */ |
| 3196 | * should be moved to restore balance via the imbalance parameter. | 3228 | struct sg_lb_stats { |
| 3229 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | ||
| 3230 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
| 3231 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
| 3232 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
| 3233 | unsigned long group_capacity; | ||
| 3234 | int group_imb; /* Is there an imbalance in the group ? */ | ||
| 3235 | }; | ||
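The two structures above replace the pile of local variables the old find_busiest_group() carried around: sg_lb_stats is filled in once per group and then folded into the domain-wide sd_lb_stats. A throwaway sketch of that roll-up pattern, with invented field names and load numbers (not the kernel structures or units):

```c
#include <stdio.h>

/* stand-ins for sg_lb_stats / sd_lb_stats, heavily simplified */
struct group_stats { unsigned long load, nr_running; };
struct domain_stats {
	unsigned long total_load, max_load;
	int busiest;            /* index of the busiest group, -1 if none */
};

int main(void)
{
	struct group_stats groups[3] = { {1024, 2}, {3072, 5}, {512, 1} };
	struct domain_stats sds = { 0, 0, -1 };

	for (int i = 0; i < 3; i++) {
		/* per-group stats get folded into the domain-wide view */
		sds.total_load += groups[i].load;
		if (groups[i].load > sds.max_load) {
			sds.max_load = groups[i].load;
			sds.busiest = i;
		}
	}
	printf("busiest group %d, total load %lu\n",
	       sds.busiest, sds.total_load);
	return 0;
}
```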
| 3236 | |||
| 3237 | /** | ||
| 3238 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
| 3239 | * @group: The group whose first cpu is to be returned. | ||
| 3197 | */ | 3240 | */ |
| 3198 | static struct sched_group * | 3241 | static inline unsigned int group_first_cpu(struct sched_group *group) |
| 3199 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
| 3200 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
| 3201 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
| 3202 | { | 3242 | { |
| 3203 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 3243 | return cpumask_first(sched_group_cpus(group)); |
| 3204 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 3244 | } |
| 3205 | unsigned long max_pull; | ||
| 3206 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
| 3207 | unsigned long this_load_per_task, this_nr_running; | ||
| 3208 | int load_idx, group_imb = 0; | ||
| 3209 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 3210 | int power_savings_balance = 1; | ||
| 3211 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
| 3212 | unsigned long min_nr_running = ULONG_MAX; | ||
| 3213 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
| 3214 | #endif | ||
| 3215 | 3245 | ||
| 3216 | max_load = this_load = total_load = total_pwr = 0; | 3246 | /** |
| 3217 | busiest_load_per_task = busiest_nr_running = 0; | 3247 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
| 3218 | this_load_per_task = this_nr_running = 0; | 3248 | * @sd: The sched_domain whose load_idx is to be obtained. |
| 3249 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. | ||
| 3250 | */ | ||
| 3251 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
| 3252 | enum cpu_idle_type idle) | ||
| 3253 | { | ||
| 3254 | int load_idx; | ||
| 3219 | 3255 | ||
| 3220 | if (idle == CPU_NOT_IDLE) | 3256 | switch (idle) { |
| 3257 | case CPU_NOT_IDLE: | ||
| 3221 | load_idx = sd->busy_idx; | 3258 | load_idx = sd->busy_idx; |
| 3222 | else if (idle == CPU_NEWLY_IDLE) | 3259 | break; |
| 3260 | |||
| 3261 | case CPU_NEWLY_IDLE: | ||
| 3223 | load_idx = sd->newidle_idx; | 3262 | load_idx = sd->newidle_idx; |
| 3224 | else | 3263 | break; |
| 3264 | default: | ||
| 3225 | load_idx = sd->idle_idx; | 3265 | load_idx = sd->idle_idx; |
| 3266 | break; | ||
| 3267 | } | ||
| 3226 | 3268 | ||
| 3227 | do { | 3269 | return load_idx; |
| 3228 | unsigned long load, group_capacity, max_cpu_load, min_cpu_load; | 3270 | } |
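get_sd_load_idx() simply maps the caller's idle state onto one of the per-domain load-average indices. A throwaway illustration of the same mapping (the enum values and index fields here are stand-ins, not the kernel definitions):

```c
#include <stdio.h>

enum cpu_idle_type { CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_IDLE };

struct sched_domain_idx { int busy_idx, newidle_idx, idle_idx; };

static int get_load_idx(const struct sched_domain_idx *sd,
			enum cpu_idle_type idle)
{
	switch (idle) {
	case CPU_NOT_IDLE:   return sd->busy_idx;
	case CPU_NEWLY_IDLE: return sd->newidle_idx;
	default:             return sd->idle_idx;
	}
}

int main(void)
{
	struct sched_domain_idx sd = { .busy_idx = 2, .newidle_idx = 0,
				       .idle_idx = 1 };
	printf("busy->%d newidle->%d idle->%d\n",
	       get_load_idx(&sd, CPU_NOT_IDLE),
	       get_load_idx(&sd, CPU_NEWLY_IDLE),
	       get_load_idx(&sd, CPU_IDLE));
	return 0;
}
```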
| 3229 | int local_group; | ||
| 3230 | int i; | ||
| 3231 | int __group_imb = 0; | ||
| 3232 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 3233 | unsigned long sum_nr_running, sum_weighted_load; | ||
| 3234 | unsigned long sum_avg_load_per_task; | ||
| 3235 | unsigned long avg_load_per_task; | ||
| 3236 | 3271 | ||
| 3237 | local_group = cpumask_test_cpu(this_cpu, | ||
| 3238 | sched_group_cpus(group)); | ||
| 3239 | 3272 | ||
| 3240 | if (local_group) | 3273 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| 3241 | balance_cpu = cpumask_first(sched_group_cpus(group)); | 3274 | /** |
| 3275 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
| 3276 | * the given sched_domain, during load balancing. | ||
| 3277 | * | ||
| 3278 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
| 3279 | * @sds: Variable containing the statistics for sd. | ||
| 3280 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
| 3281 | */ | ||
| 3282 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
| 3283 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
| 3284 | { | ||
| 3285 | /* | ||
| 3286 | * Busy processors will not participate in power savings | ||
| 3287 | * balance. | ||
| 3288 | */ | ||
| 3289 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 3290 | sds->power_savings_balance = 0; | ||
| 3291 | else { | ||
| 3292 | sds->power_savings_balance = 1; | ||
| 3293 | sds->min_nr_running = ULONG_MAX; | ||
| 3294 | sds->leader_nr_running = 0; | ||
| 3295 | } | ||
| 3296 | } | ||
| 3242 | 3297 | ||
| 3243 | /* Tally up the load of all CPUs in the group */ | 3298 | /** |
| 3244 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3299 | * update_sd_power_savings_stats - Update the power saving stats for a |
| 3245 | sum_avg_load_per_task = avg_load_per_task = 0; | 3300 | * sched_domain while performing load balancing. |
| 3301 | * | ||
| 3302 | * @group: sched_group belonging to the sched_domain under consideration. | ||
| 3303 | * @sds: Variable containing the statistics of the sched_domain | ||
| 3304 | * @local_group: Does group contain the CPU for which we're performing | ||
| 3305 | * load balancing ? | ||
| 3306 | * @sgs: Variable containing the statistics of the group. | ||
| 3307 | */ | ||
| 3308 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
| 3309 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
| 3310 | { | ||
| 3246 | 3311 | ||
| 3247 | max_cpu_load = 0; | 3312 | if (!sds->power_savings_balance) |
| 3248 | min_cpu_load = ~0UL; | 3313 | return; |
| 3249 | 3314 | ||
| 3250 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3315 | /* |
| 3251 | struct rq *rq = cpu_rq(i); | 3316 | * If the local group is idle or completely loaded |
| 3317 | * no need to do power savings balance at this domain | ||
| 3318 | */ | ||
| 3319 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
| 3320 | !sds->this_nr_running)) | ||
| 3321 | sds->power_savings_balance = 0; | ||
| 3252 | 3322 | ||
| 3253 | if (*sd_idle && rq->nr_running) | 3323 | /* |
| 3254 | *sd_idle = 0; | 3324 | * If a group is already running at full capacity or idle, |
| 3325 | * don't include that group in power savings calculations | ||
| 3326 | */ | ||
| 3327 | if (!sds->power_savings_balance || | ||
| 3328 | sgs->sum_nr_running >= sgs->group_capacity || | ||
| 3329 | !sgs->sum_nr_running) | ||
| 3330 | return; | ||
| 3255 | 3331 | ||
| 3256 | /* Bias balancing toward cpus of our domain */ | 3332 | /* |
| 3257 | if (local_group) { | 3333 | * Calculate the group which has the least non-idle load. |
| 3258 | if (idle_cpu(i) && !first_idle_cpu) { | 3334 | * This is the group from where we need to pick up the load |
| 3259 | first_idle_cpu = 1; | 3335 | * for saving power |
| 3260 | balance_cpu = i; | 3336 | */ |
| 3261 | } | 3337 | if ((sgs->sum_nr_running < sds->min_nr_running) || |
| 3338 | (sgs->sum_nr_running == sds->min_nr_running && | ||
| 3339 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
| 3340 | sds->group_min = group; | ||
| 3341 | sds->min_nr_running = sgs->sum_nr_running; | ||
| 3342 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
| 3343 | sgs->sum_nr_running; | ||
| 3344 | } | ||
| 3262 | 3345 | ||
| 3263 | load = target_load(i, load_idx); | 3346 | /* |
| 3264 | } else { | 3347 | * Calculate the group which is almost near its |
| 3265 | load = source_load(i, load_idx); | 3348 | * capacity but still has some space to pick up some load |
| 3266 | if (load > max_cpu_load) | 3349 | * from other group and save more power |
| 3267 | max_cpu_load = load; | 3350 | */ |
| 3268 | if (min_cpu_load > load) | 3351 | if (sgs->sum_nr_running > sgs->group_capacity - 1) |
| 3269 | min_cpu_load = load; | 3352 | return; |
| 3270 | } | ||
| 3271 | 3353 | ||
| 3272 | avg_load += load; | 3354 | if (sgs->sum_nr_running > sds->leader_nr_running || |
| 3273 | sum_nr_running += rq->nr_running; | 3355 | (sgs->sum_nr_running == sds->leader_nr_running && |
| 3274 | sum_weighted_load += weighted_cpuload(i); | 3356 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { |
| 3357 | sds->group_leader = group; | ||
| 3358 | sds->leader_nr_running = sgs->sum_nr_running; | ||
| 3359 | } | ||
| 3360 | } | ||
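The selection rules above boil down to: group_min is the non-empty, non-full group with the fewest tasks (ties broken toward the higher-numbered first CPU), while group_leader is the group closest to capacity that still has a free slot (ties broken toward the lower-numbered first CPU). A self-contained toy version of that selection, with invented group sizes and capacities:

```c
#include <stdio.h>

struct grp { int first_cpu; unsigned long nr_running, capacity; };

int main(void)
{
	struct grp g[] = { {0, 4, 4}, {4, 1, 4}, {8, 3, 4} };
	int min = -1, leader = -1;

	for (int i = 0; i < 3; i++) {
		/* skip idle or already-full groups, as the kernel code does */
		if (!g[i].nr_running || g[i].nr_running >= g[i].capacity)
			continue;
		if (min < 0 || g[i].nr_running < g[min].nr_running ||
		    (g[i].nr_running == g[min].nr_running &&
		     g[i].first_cpu > g[min].first_cpu))
			min = i;
		if (g[i].nr_running <= g[i].capacity - 1 &&
		    (leader < 0 || g[i].nr_running > g[leader].nr_running ||
		     (g[i].nr_running == g[leader].nr_running &&
		      g[i].first_cpu < g[leader].first_cpu)))
			leader = i;
	}
	printf("group_min first_cpu=%d, group_leader first_cpu=%d\n",
	       min >= 0 ? g[min].first_cpu : -1,
	       leader >= 0 ? g[leader].first_cpu : -1);
	return 0;
}
```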
| 3275 | 3361 | ||
| 3276 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | 3362 | /** |
| 3277 | } | 3363 | * check_power_save_busiest_group - Check if we have potential to perform |
| 3364 | * some power-savings balance. If yes, set the busiest group to be | ||
| 3365 | * the least loaded group in the sched_domain, so that its CPUs can | ||
| 3366 | * be put to idle. | ||
| 3367 | * | ||
| 3368 | * @sds: Variable containing the statistics of the sched_domain | ||
| 3369 | * under consideration. | ||
| 3370 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
| 3371 | * @imbalance: Variable to store the imbalance. | ||
| 3372 | * | ||
| 3373 | * Returns 1 if there is potential to perform power-savings balance. | ||
| 3374 | * Else returns 0. | ||
| 3375 | */ | ||
| 3376 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
| 3377 | int this_cpu, unsigned long *imbalance) | ||
| 3378 | { | ||
| 3379 | if (!sds->power_savings_balance) | ||
| 3380 | return 0; | ||
| 3278 | 3381 | ||
| 3279 | /* | 3382 | if (sds->this != sds->group_leader || |
| 3280 | * First idle cpu or the first cpu(busiest) in this sched group | 3383 | sds->group_leader == sds->group_min) |
| 3281 | * is eligible for doing load balancing at this and above | 3384 | return 0; |
| 3282 | * domains. In the newly idle case, we will allow all the cpu's | ||
| 3283 | * to do the newly idle load balance. | ||
| 3284 | */ | ||
| 3285 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
| 3286 | balance_cpu != this_cpu && balance) { | ||
| 3287 | *balance = 0; | ||
| 3288 | goto ret; | ||
| 3289 | } | ||
| 3290 | 3385 | ||
| 3291 | total_load += avg_load; | 3386 | *imbalance = sds->min_load_per_task; |
| 3292 | total_pwr += group->__cpu_power; | 3387 | sds->busiest = sds->group_min; |
| 3293 | 3388 | ||
| 3294 | /* Adjust by relative CPU power of the group */ | 3389 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { |
| 3295 | avg_load = sg_div_cpu_power(group, | 3390 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = |
| 3296 | avg_load * SCHED_LOAD_SCALE); | 3391 | group_first_cpu(sds->group_leader); |
| 3392 | } | ||
| 3297 | 3393 | ||
| 3394 | return 1; | ||
| 3298 | 3395 | ||
| 3299 | /* | 3396 | } |
| 3300 | * Consider the group unbalanced when the imbalance is larger | 3397 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 3301 | * than the average weight of two tasks. | 3398 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, |
| 3302 | * | 3399 | struct sd_lb_stats *sds, enum cpu_idle_type idle) |
| 3303 | * APZ: with cgroup the avg task weight can vary wildly and | 3400 | { |
| 3304 | * might not be a suitable number - should we keep a | 3401 | return; |
| 3305 | * normalized nr_running number somewhere that negates | 3402 | } |
| 3306 | * the hierarchy? | 3403 | |
| 3307 | */ | 3404 | static inline void update_sd_power_savings_stats(struct sched_group *group, |
| 3308 | avg_load_per_task = sg_div_cpu_power(group, | 3405 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) |
| 3309 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3406 | { |
| 3407 | return; | ||
| 3408 | } | ||
| 3409 | |||
| 3410 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
| 3411 | int this_cpu, unsigned long *imbalance) | ||
| 3412 | { | ||
| 3413 | return 0; | ||
| 3414 | } | ||
| 3415 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
| 3416 | |||
| 3417 | |||
| 3418 | /** | ||
| 3419 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
| 3420 | * @group: sched_group whose statistics are to be updated. | ||
| 3421 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 3422 | * @idle: Idle status of this_cpu | ||
| 3423 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
| 3424 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 3425 | * @local_group: Does group contain this_cpu. | ||
| 3426 | * @cpus: Set of cpus considered for load balancing. | ||
| 3427 | * @balance: Should we balance. | ||
| 3428 | * @sgs: variable to hold the statistics for this group. | ||
| 3429 | */ | ||
| 3430 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | ||
| 3431 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
| 3432 | int local_group, const struct cpumask *cpus, | ||
| 3433 | int *balance, struct sg_lb_stats *sgs) | ||
| 3434 | { | ||
| 3435 | unsigned long load, max_cpu_load, min_cpu_load; | ||
| 3436 | int i; | ||
| 3437 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 3438 | unsigned long sum_avg_load_per_task; | ||
| 3439 | unsigned long avg_load_per_task; | ||
| 3310 | 3440 | ||
| 3311 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3441 | if (local_group) |
| 3312 | __group_imb = 1; | 3442 | balance_cpu = group_first_cpu(group); |
| 3313 | 3443 | ||
| 3314 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3444 | /* Tally up the load of all CPUs in the group */ |
| 3445 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
| 3446 | max_cpu_load = 0; | ||
| 3447 | min_cpu_load = ~0UL; | ||
| 3315 | 3448 | ||
| 3449 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
| 3450 | struct rq *rq = cpu_rq(i); | ||
| 3451 | |||
| 3452 | if (*sd_idle && rq->nr_running) | ||
| 3453 | *sd_idle = 0; | ||
| 3454 | |||
| 3455 | /* Bias balancing toward cpus of our domain */ | ||
| 3316 | if (local_group) { | 3456 | if (local_group) { |
| 3317 | this_load = avg_load; | 3457 | if (idle_cpu(i) && !first_idle_cpu) { |
| 3318 | this = group; | 3458 | first_idle_cpu = 1; |
| 3319 | this_nr_running = sum_nr_running; | 3459 | balance_cpu = i; |
| 3320 | this_load_per_task = sum_weighted_load; | 3460 | } |
| 3321 | } else if (avg_load > max_load && | 3461 | |
| 3322 | (sum_nr_running > group_capacity || __group_imb)) { | 3462 | load = target_load(i, load_idx); |
| 3323 | max_load = avg_load; | 3463 | } else { |
| 3324 | busiest = group; | 3464 | load = source_load(i, load_idx); |
| 3325 | busiest_nr_running = sum_nr_running; | 3465 | if (load > max_cpu_load) |
| 3326 | busiest_load_per_task = sum_weighted_load; | 3466 | max_cpu_load = load; |
| 3327 | group_imb = __group_imb; | 3467 | if (min_cpu_load > load) |
| 3468 | min_cpu_load = load; | ||
| 3328 | } | 3469 | } |
| 3329 | 3470 | ||
| 3330 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 3471 | sgs->group_load += load; |
| 3331 | /* | 3472 | sgs->sum_nr_running += rq->nr_running; |
| 3332 | * Busy processors will not participate in power savings | 3473 | sgs->sum_weighted_load += weighted_cpuload(i); |
| 3333 | * balance. | ||
| 3334 | */ | ||
| 3335 | if (idle == CPU_NOT_IDLE || | ||
| 3336 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 3337 | goto group_next; | ||
| 3338 | 3474 | ||
| 3339 | /* | 3475 | sum_avg_load_per_task += cpu_avg_load_per_task(i); |
| 3340 | * If the local group is idle or completely loaded | 3476 | } |
| 3341 | * no need to do power savings balance at this domain | ||
| 3342 | */ | ||
| 3343 | if (local_group && (this_nr_running >= group_capacity || | ||
| 3344 | !this_nr_running)) | ||
| 3345 | power_savings_balance = 0; | ||
| 3346 | 3477 | ||
| 3347 | /* | 3478 | /* |
| 3348 | * If a group is already running at full capacity or idle, | 3479 | * First idle cpu or the first cpu(busiest) in this sched group |
| 3349 | * don't include that group in power savings calculations | 3480 | * is eligible for doing load balancing at this and above |
| 3350 | */ | 3481 | * domains. In the newly idle case, we will allow all the cpu's |
| 3351 | if (!power_savings_balance || sum_nr_running >= group_capacity | 3482 | * to do the newly idle load balance. |
| 3352 | || !sum_nr_running) | 3483 | */ |
| 3353 | goto group_next; | 3484 | if (idle != CPU_NEWLY_IDLE && local_group && |
| 3485 | balance_cpu != this_cpu && balance) { | ||
| 3486 | *balance = 0; | ||
| 3487 | return; | ||
| 3488 | } | ||
| 3354 | 3489 | ||
| 3355 | /* | 3490 | /* Adjust by relative CPU power of the group */ |
| 3356 | * Calculate the group which has the least non-idle load. | 3491 | sgs->avg_load = sg_div_cpu_power(group, |
| 3357 | * This is the group from where we need to pick up the load | 3492 | sgs->group_load * SCHED_LOAD_SCALE); |
| 3358 | * for saving power | ||
| 3359 | */ | ||
| 3360 | if ((sum_nr_running < min_nr_running) || | ||
| 3361 | (sum_nr_running == min_nr_running && | ||
| 3362 | cpumask_first(sched_group_cpus(group)) > | ||
| 3363 | cpumask_first(sched_group_cpus(group_min)))) { | ||
| 3364 | group_min = group; | ||
| 3365 | min_nr_running = sum_nr_running; | ||
| 3366 | min_load_per_task = sum_weighted_load / | ||
| 3367 | sum_nr_running; | ||
| 3368 | } | ||
| 3369 | 3493 | ||
| 3370 | /* | 3494 | |
| 3371 | * Calculate the group which is almost near its | 3495 | /* |
| 3372 | * capacity but still has some space to pick up some load | 3496 | * Consider the group unbalanced when the imbalance is larger |
| 3373 | * from other group and save more power | 3497 | * than the average weight of two tasks. |
| 3374 | */ | 3498 | * |
| 3375 | if (sum_nr_running <= group_capacity - 1) { | 3499 | * APZ: with cgroup the avg task weight can vary wildly and |
| 3376 | if (sum_nr_running > leader_nr_running || | 3500 | * might not be a suitable number - should we keep a |
| 3377 | (sum_nr_running == leader_nr_running && | 3501 | * normalized nr_running number somewhere that negates |
| 3378 | cpumask_first(sched_group_cpus(group)) < | 3502 | * the hierarchy? |
| 3379 | cpumask_first(sched_group_cpus(group_leader)))) { | 3503 | */ |
| 3380 | group_leader = group; | 3504 | avg_load_per_task = sg_div_cpu_power(group, |
| 3381 | leader_nr_running = sum_nr_running; | 3505 | sum_avg_load_per_task * SCHED_LOAD_SCALE); |
| 3382 | } | 3506 | |
| 3507 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
| 3508 | sgs->group_imb = 1; | ||
| 3509 | |||
| 3510 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | ||
| 3511 | |||
| 3512 | } | ||
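The group_imb flag above comes from a simple spread test: if the gap between the group's most and least loaded CPU exceeds twice the average task weight, the group is treated as internally imbalanced. A worked toy example (per-CPU loads and task count are invented, and the kernel derives avg_load_per_task through sg_div_cpu_power() rather than a plain division):

```c
#include <stdio.h>

int main(void)
{
	unsigned long cpu_load[4] = { 3072, 1024, 1024, 0 };
	unsigned long max_cpu_load = 0, min_cpu_load = ~0UL, sum = 0;
	unsigned long avg_load_per_task;
	int nr_tasks = 5, group_imb;

	for (int i = 0; i < 4; i++) {
		sum += cpu_load[i];
		if (cpu_load[i] > max_cpu_load)
			max_cpu_load = cpu_load[i];
		if (cpu_load[i] < min_cpu_load)
			min_cpu_load = cpu_load[i];
	}
	avg_load_per_task = sum / nr_tasks;            /* 5120 / 5 = 1024 */
	group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;
	printf("spread=%lu, 2*avg=%lu, group_imb=%d\n",
	       max_cpu_load - min_cpu_load, 2 * avg_load_per_task, group_imb);
	return 0;
}
```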
| 3513 | |||
| 3514 | /** | ||
| 3515 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | ||
| 3516 | * @sd: sched_domain whose statistics are to be updated. | ||
| 3517 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 3518 | * @idle: Idle status of this_cpu | ||
| 3519 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 3520 | * @cpus: Set of cpus considered for load balancing. | ||
| 3521 | * @balance: Should we balance. | ||
| 3522 | * @sds: variable to hold the statistics for this sched_domain. | ||
| 3523 | */ | ||
| 3524 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
| 3525 | enum cpu_idle_type idle, int *sd_idle, | ||
| 3526 | const struct cpumask *cpus, int *balance, | ||
| 3527 | struct sd_lb_stats *sds) | ||
| 3528 | { | ||
| 3529 | struct sched_group *group = sd->groups; | ||
| 3530 | struct sg_lb_stats sgs; | ||
| 3531 | int load_idx; | ||
| 3532 | |||
| 3533 | init_sd_power_savings_stats(sd, sds, idle); | ||
| 3534 | load_idx = get_sd_load_idx(sd, idle); | ||
| 3535 | |||
| 3536 | do { | ||
| 3537 | int local_group; | ||
| 3538 | |||
| 3539 | local_group = cpumask_test_cpu(this_cpu, | ||
| 3540 | sched_group_cpus(group)); | ||
| 3541 | memset(&sgs, 0, sizeof(sgs)); | ||
| 3542 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | ||
| 3543 | local_group, cpus, balance, &sgs); | ||
| 3544 | |||
| 3545 | if (local_group && balance && !(*balance)) | ||
| 3546 | return; | ||
| 3547 | |||
| 3548 | sds->total_load += sgs.group_load; | ||
| 3549 | sds->total_pwr += group->__cpu_power; | ||
| 3550 | |||
| 3551 | if (local_group) { | ||
| 3552 | sds->this_load = sgs.avg_load; | ||
| 3553 | sds->this = group; | ||
| 3554 | sds->this_nr_running = sgs.sum_nr_running; | ||
| 3555 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
| 3556 | } else if (sgs.avg_load > sds->max_load && | ||
| 3557 | (sgs.sum_nr_running > sgs.group_capacity || | ||
| 3558 | sgs.group_imb)) { | ||
| 3559 | sds->max_load = sgs.avg_load; | ||
| 3560 | sds->busiest = group; | ||
| 3561 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
| 3562 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
| 3563 | sds->group_imb = sgs.group_imb; | ||
| 3383 | } | 3564 | } |
| 3384 | group_next: | 3565 | |
| 3385 | #endif | 3566 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
| 3386 | group = group->next; | 3567 | group = group->next; |
| 3387 | } while (group != sd->groups); | 3568 | } while (group != sd->groups); |
| 3388 | 3569 | ||
| 3389 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) | 3570 | } |
| 3390 | goto out_balanced; | ||
| 3391 | |||
| 3392 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | ||
| 3393 | 3571 | ||
| 3394 | if (this_load >= avg_load || | 3572 | /** |
| 3395 | 100*max_load <= sd->imbalance_pct*this_load) | 3573 | * fix_small_imbalance - Calculate the minor imbalance that exists |
| 3396 | goto out_balanced; | 3574 | * amongst the groups of a sched_domain, during |
| 3575 | * load balancing. | ||
| 3576 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
| 3577 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 3578 | * @imbalance: Variable to store the imbalance. | ||
| 3579 | */ | ||
| 3580 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
| 3581 | int this_cpu, unsigned long *imbalance) | ||
| 3582 | { | ||
| 3583 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
| 3584 | unsigned int imbn = 2; | ||
| 3585 | |||
| 3586 | if (sds->this_nr_running) { | ||
| 3587 | sds->this_load_per_task /= sds->this_nr_running; | ||
| 3588 | if (sds->busiest_load_per_task > | ||
| 3589 | sds->this_load_per_task) | ||
| 3590 | imbn = 1; | ||
| 3591 | } else | ||
| 3592 | sds->this_load_per_task = | ||
| 3593 | cpu_avg_load_per_task(this_cpu); | ||
| 3397 | 3594 | ||
| 3398 | busiest_load_per_task /= busiest_nr_running; | 3595 | if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= |
| 3399 | if (group_imb) | 3596 | sds->busiest_load_per_task * imbn) { |
| 3400 | busiest_load_per_task = min(busiest_load_per_task, avg_load); | 3597 | *imbalance = sds->busiest_load_per_task; |
| 3598 | return; | ||
| 3599 | } | ||
| 3401 | 3600 | ||
| 3402 | /* | 3601 | /* |
| 3403 | * We're trying to get all the cpus to the average_load, so we don't | 3602 | * OK, we don't have enough imbalance to justify moving tasks, |
| 3404 | * want to push ourselves above the average load, nor do we wish to | 3603 | * however we may be able to increase total CPU power used by |
| 3405 | * reduce the max loaded cpu below the average load, as either of these | 3604 | * moving them. |
| 3406 | * actions would just result in more rebalancing later, and ping-pong | ||
| 3407 | * tasks around. Thus we look for the minimum possible imbalance. | ||
| 3408 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
| 3409 | * be counted as no imbalance for these purposes -- we can't fix that | ||
| 3410 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
| 3411 | * appear as very large values with unsigned longs. | ||
| 3412 | */ | 3605 | */ |
| 3413 | if (max_load <= busiest_load_per_task) | ||
| 3414 | goto out_balanced; | ||
| 3415 | 3606 | ||
| 3607 | pwr_now += sds->busiest->__cpu_power * | ||
| 3608 | min(sds->busiest_load_per_task, sds->max_load); | ||
| 3609 | pwr_now += sds->this->__cpu_power * | ||
| 3610 | min(sds->this_load_per_task, sds->this_load); | ||
| 3611 | pwr_now /= SCHED_LOAD_SCALE; | ||
| 3612 | |||
| 3613 | /* Amount of load we'd subtract */ | ||
| 3614 | tmp = sg_div_cpu_power(sds->busiest, | ||
| 3615 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | ||
| 3616 | if (sds->max_load > tmp) | ||
| 3617 | pwr_move += sds->busiest->__cpu_power * | ||
| 3618 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
| 3619 | |||
| 3620 | /* Amount of load we'd add */ | ||
| 3621 | if (sds->max_load * sds->busiest->__cpu_power < | ||
| 3622 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
| 3623 | tmp = sg_div_cpu_power(sds->this, | ||
| 3624 | sds->max_load * sds->busiest->__cpu_power); | ||
| 3625 | else | ||
| 3626 | tmp = sg_div_cpu_power(sds->this, | ||
| 3627 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | ||
| 3628 | pwr_move += sds->this->__cpu_power * | ||
| 3629 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
| 3630 | pwr_move /= SCHED_LOAD_SCALE; | ||
| 3631 | |||
| 3632 | /* Move if we gain throughput */ | ||
| 3633 | if (pwr_move > pwr_now) | ||
| 3634 | *imbalance = sds->busiest_load_per_task; | ||
| 3635 | } | ||
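fix_small_imbalance() asks whether moving a single task would still raise total throughput even though the load gap is smaller than one task. A rough, self-contained rendering of that pwr_now/pwr_move comparison with invented powers and loads; it collapses the two-branch "amount of load we'd add" computation into its common case and uses plain division in place of sg_div_cpu_power():

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* invented numbers: an idle local group next to a doubly loaded one */
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long max_load = 2048, this_load = 0;
	unsigned long busiest_load_per_task = 1024, this_load_per_task = 1024;
	unsigned long pwr_now = 0, pwr_move = 0, tmp;

	/* throughput as things stand */
	pwr_now += busiest_power * min_ul(busiest_load_per_task, max_load);
	pwr_now += this_power * min_ul(this_load_per_task, this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* load the busiest group would shed */
	tmp = busiest_load_per_task * SCHED_LOAD_SCALE / busiest_power;
	if (max_load > tmp)
		pwr_move += busiest_power *
			min_ul(busiest_load_per_task, max_load - tmp);

	/* load the local group would pick up (simplified single branch) */
	tmp = busiest_load_per_task * SCHED_LOAD_SCALE / this_power;
	pwr_move += this_power * min_ul(this_load_per_task, this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move one task" : "leave it");
	return 0;
}
```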
| 3636 | |||
| 3637 | /** | ||
| 3638 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
| 3639 | * groups of a given sched_domain during load balance. | ||
| 3640 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
| 3641 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
| 3642 | * @imbalance: The variable to store the imbalance. | ||
| 3643 | */ | ||
| 3644 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
| 3645 | unsigned long *imbalance) | ||
| 3646 | { | ||
| 3647 | unsigned long max_pull; | ||
| 3416 | /* | 3648 | /* |
| 3417 | * In the presence of smp nice balancing, certain scenarios can have | 3649 | * In the presence of smp nice balancing, certain scenarios can have |
| 3418 | * max load less than avg load(as we skip the groups at or below | 3650 | * max load less than avg load(as we skip the groups at or below |
| 3419 | * its cpu_power, while calculating max_load..) | 3651 | * its cpu_power, while calculating max_load..) |
| 3420 | */ | 3652 | */ |
| 3421 | if (max_load < avg_load) { | 3653 | if (sds->max_load < sds->avg_load) { |
| 3422 | *imbalance = 0; | 3654 | *imbalance = 0; |
| 3423 | goto small_imbalance; | 3655 | return fix_small_imbalance(sds, this_cpu, imbalance); |
| 3424 | } | 3656 | } |
| 3425 | 3657 | ||
| 3426 | /* Don't want to pull so many tasks that a group would go idle */ | 3658 | /* Don't want to pull so many tasks that a group would go idle */ |
| 3427 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); | 3659 | max_pull = min(sds->max_load - sds->avg_load, |
| 3660 | sds->max_load - sds->busiest_load_per_task); | ||
| 3428 | 3661 | ||
| 3429 | /* How much load to actually move to equalise the imbalance */ | 3662 | /* How much load to actually move to equalise the imbalance */ |
| 3430 | *imbalance = min(max_pull * busiest->__cpu_power, | 3663 | *imbalance = min(max_pull * sds->busiest->__cpu_power, |
| 3431 | (avg_load - this_load) * this->__cpu_power) | 3664 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) |
| 3432 | / SCHED_LOAD_SCALE; | 3665 | / SCHED_LOAD_SCALE; |
| 3433 | 3666 | ||
| 3434 | /* | 3667 | /* |
@@ -3437,78 +3670,110 @@ group_next:
| 3437 | * a think about bumping its value to force at least one task to be | 3670 | * a think about bumping its value to force at least one task to be |
| 3438 | * moved | 3671 | * moved |
| 3439 | */ | 3672 | */ |
| 3440 | if (*imbalance < busiest_load_per_task) { | 3673 | if (*imbalance < sds->busiest_load_per_task) |
| 3441 | unsigned long tmp, pwr_now, pwr_move; | 3674 | return fix_small_imbalance(sds, this_cpu, imbalance); |
| 3442 | unsigned int imbn; | ||
| 3443 | |||
| 3444 | small_imbalance: | ||
| 3445 | pwr_move = pwr_now = 0; | ||
| 3446 | imbn = 2; | ||
| 3447 | if (this_nr_running) { | ||
| 3448 | this_load_per_task /= this_nr_running; | ||
| 3449 | if (busiest_load_per_task > this_load_per_task) | ||
| 3450 | imbn = 1; | ||
| 3451 | } else | ||
| 3452 | this_load_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 3453 | 3675 | ||
| 3454 | if (max_load - this_load + busiest_load_per_task >= | 3676 | } |
| 3455 | busiest_load_per_task * imbn) { | 3677 | /******* find_busiest_group() helpers end here *********************/ |
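calculate_imbalance() clamps how much load to pull: never below the domain average, never so much that the busiest group would be left idle, and never more than what would push the local group past the average. A small numeric walk-through of that clamp (all figures invented, in SCHED_LOAD_SCALE fixed-point style):

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	/* invented figures for one sched_domain */
	unsigned long max_load = 3072, avg_load = 1536, this_load = 512;
	unsigned long busiest_load_per_task = 1024;
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long max_pull, imbalance;

	/* don't pull the busiest group below the domain average ... */
	max_pull = max_load - avg_load;                        /* 1536 */
	/* ... and don't pull so much that it would go idle */
	if (max_load - busiest_load_per_task < max_pull)
		max_pull = max_load - busiest_load_per_task;   /* 2048 here, so no clamp */

	/* cap by how much this group can take before passing the average */
	imbalance = max_pull * busiest_power;
	if ((avg_load - this_load) * this_power < imbalance)
		imbalance = (avg_load - this_load) * this_power;
	imbalance /= SCHED_LOAD_SCALE;                         /* -> 1024 */

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	return 0;
}
```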
| 3456 | *imbalance = busiest_load_per_task; | ||
| 3457 | return busiest; | ||
| 3458 | } | ||
| 3459 | 3678 | ||
| 3460 | /* | 3679 | /** |
| 3461 | * OK, we don't have enough imbalance to justify moving tasks, | 3680 | * find_busiest_group - Returns the busiest group within the sched_domain |
| 3462 | * however we may be able to increase total CPU power used by | 3681 | * if there is an imbalance. If there isn't an imbalance, and |
| 3463 | * moving them. | 3682 | * the user has opted for power-savings, it returns a group whose |
| 3464 | */ | 3683 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if |
| 3684 | * such a group exists. | ||
| 3685 | * | ||
| 3686 | * Also calculates the amount of weighted load which should be moved | ||
| 3687 | * to restore balance. | ||
| 3688 | * | ||
| 3689 | * @sd: The sched_domain whose busiest group is to be returned. | ||
| 3690 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
| 3691 | * @imbalance: Variable which stores amount of weighted load which should | ||
| 3692 | * be moved to restore balance/put a group to idle. | ||
| 3693 | * @idle: The idle status of this_cpu. | ||
| 3694 | * @sd_idle: The idleness of sd | ||
| 3695 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
| 3696 | * @balance: Pointer to a variable indicating if this_cpu | ||
| 3697 | * is the appropriate cpu to perform load balancing at this_level. | ||
| 3698 | * | ||
| 3699 | * Returns: - the busiest group if imbalance exists. | ||
| 3700 | * - If no imbalance and user has opted for power-savings balance, | ||
| 3701 | * return the least loaded group whose CPUs can be | ||
| 3702 | * put to idle by rebalancing its tasks onto our group. | ||
| 3703 | */ | ||
| 3704 | static struct sched_group * | ||
| 3705 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
| 3706 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
| 3707 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
| 3708 | { | ||
| 3709 | struct sd_lb_stats sds; | ||
| 3465 | 3710 | ||
| 3466 | pwr_now += busiest->__cpu_power * | 3711 | memset(&sds, 0, sizeof(sds)); |
| 3467 | min(busiest_load_per_task, max_load); | ||
| 3468 | pwr_now += this->__cpu_power * | ||
| 3469 | min(this_load_per_task, this_load); | ||
| 3470 | pwr_now /= SCHED_LOAD_SCALE; | ||
| 3471 | |||
| 3472 | /* Amount of load we'd subtract */ | ||
| 3473 | tmp = sg_div_cpu_power(busiest, | ||
| 3474 | busiest_load_per_task * SCHED_LOAD_SCALE); | ||
| 3475 | if (max_load > tmp) | ||
| 3476 | pwr_move += busiest->__cpu_power * | ||
| 3477 | min(busiest_load_per_task, max_load - tmp); | ||
| 3478 | |||
| 3479 | /* Amount of load we'd add */ | ||
| 3480 | if (max_load * busiest->__cpu_power < | ||
| 3481 | busiest_load_per_task * SCHED_LOAD_SCALE) | ||
| 3482 | tmp = sg_div_cpu_power(this, | ||
| 3483 | max_load * busiest->__cpu_power); | ||
| 3484 | else | ||
| 3485 | tmp = sg_div_cpu_power(this, | ||
| 3486 | busiest_load_per_task * SCHED_LOAD_SCALE); | ||
| 3487 | pwr_move += this->__cpu_power * | ||
| 3488 | min(this_load_per_task, this_load + tmp); | ||
| 3489 | pwr_move /= SCHED_LOAD_SCALE; | ||
| 3490 | 3712 | ||
| 3491 | /* Move if we gain throughput */ | 3713 | /* |
| 3492 | if (pwr_move > pwr_now) | 3714 | * Compute the various statistics relevant for load balancing at |
| 3493 | *imbalance = busiest_load_per_task; | 3715 | * this level. |
| 3494 | } | 3716 | */ |
| 3717 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
| 3718 | balance, &sds); | ||
| 3719 | |||
| 3720 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
| 3721 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
| 3722 | * at this level. | ||
| 3723 | * 2) There is no busy sibling group to pull from. | ||
| 3724 | * 3) This group is the busiest group. | ||
| 3725 | * 4) This group is busier than the average busyness at this | ||
| 3726 | * sched_domain. | ||
| 3727 | * 5) The imbalance is within the specified limit. | ||
| 3728 | * 6) Any rebalance would lead to ping-pong | ||
| 3729 | */ | ||
| 3730 | if (balance && !(*balance)) | ||
| 3731 | goto ret; | ||
| 3495 | 3732 | ||
| 3496 | return busiest; | 3733 | if (!sds.busiest || sds.busiest_nr_running == 0) |
| 3734 | goto out_balanced; | ||
| 3497 | 3735 | ||
| 3498 | out_balanced: | 3736 | if (sds.this_load >= sds.max_load) |
| 3499 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 3737 | goto out_balanced; |
| 3500 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 3501 | goto ret; | ||
| 3502 | 3738 | ||
| 3503 | if (this == group_leader && group_leader != group_min) { | 3739 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; |
| 3504 | *imbalance = min_load_per_task; | 3740 | |
| 3505 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | 3741 | if (sds.this_load >= sds.avg_load) |
| 3506 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | 3742 | goto out_balanced; |
| 3507 | cpumask_first(sched_group_cpus(group_leader)); | 3743 | |
| 3508 | } | 3744 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) |
| 3509 | return group_min; | 3745 | goto out_balanced; |
| 3510 | } | 3746 | |
| 3511 | #endif | 3747 | sds.busiest_load_per_task /= sds.busiest_nr_running; |
| 3748 | if (sds.group_imb) | ||
| 3749 | sds.busiest_load_per_task = | ||
| 3750 | min(sds.busiest_load_per_task, sds.avg_load); | ||
| 3751 | |||
| 3752 | /* | ||
| 3753 | * We're trying to get all the cpus to the average_load, so we don't | ||
| 3754 | * want to push ourselves above the average load, nor do we wish to | ||
| 3755 | * reduce the max loaded cpu below the average load, as either of these | ||
| 3756 | * actions would just result in more rebalancing later, and ping-pong | ||
| 3757 | * tasks around. Thus we look for the minimum possible imbalance. | ||
| 3758 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
| 3759 | * be counted as no imbalance for these purposes -- we can't fix that | ||
| 3760 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
| 3761 | * appear as very large values with unsigned longs. | ||
| 3762 | */ | ||
| 3763 | if (sds.max_load <= sds.busiest_load_per_task) | ||
| 3764 | goto out_balanced; | ||
| 3765 | |||
| 3766 | /* Looks like there is an imbalance. Compute it */ | ||
| 3767 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
| 3768 | return sds.busiest; | ||
| 3769 | |||
| 3770 | out_balanced: | ||
| 3771 | /* | ||
| 3772 | * There is no obvious imbalance. But check if we can do some balancing | ||
| 3773 | * to save power. | ||
| 3774 | */ | ||
| 3775 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
| 3776 | return sds.busiest; | ||
| 3512 | ret: | 3777 | ret: |
| 3513 | *imbalance = 0; | 3778 | *imbalance = 0; |
| 3514 | return NULL; | 3779 | return NULL; |
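Read end to end, the new code leaves find_busiest_group() as a thin driver: update_sd_lb_stats() walks the groups once (calling update_sg_lb_stats() and update_sd_power_savings_stats() per group), the balanced/unbalanced decision is taken from the collected sd_lb_stats against the numbered cases in the kernel-doc above, calculate_imbalance() (with fix_small_imbalance() for sub-task-sized gaps) sizes the move, and check_power_save_busiest_group() covers the power-savings fallback on the otherwise-balanced path. Judging from the diff alone, this appears to be a straight restructuring of the old ~250-line function body into helpers rather than a behavioural change.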
