Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 632 |
1 file changed, 373 insertions, 259 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c527449..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated) | |||
851 | { | 851 | { |
852 | struct task_struct *p = current; | 852 | struct task_struct *p = current; |
853 | 853 | ||
854 | if (!sched_feat_numa(NUMA)) | 854 | if (!numabalancing_enabled) |
855 | return; | 855 | return; |
856 | 856 | ||
857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 857 | /* FIXME: Allocate task-specific structure for placement policy here */ |
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
2032 | */ | 2032 | */ |
2033 | update_entity_load_avg(curr, 1); | 2033 | update_entity_load_avg(curr, 1); |
2034 | update_cfs_rq_blocked_load(cfs_rq, 1); | 2034 | update_cfs_rq_blocked_load(cfs_rq, 1); |
2035 | update_cfs_shares(cfs_rq); | ||
2035 | 2036 | ||
2036 | #ifdef CONFIG_SCHED_HRTICK | 2037 | #ifdef CONFIG_SCHED_HRTICK |
2037 | /* | 2038 | /* |
@@ -3017,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
3017 | return 0; | 3018 | return 0; |
3018 | } | 3019 | } |
3019 | 3020 | ||
3021 | static void record_wakee(struct task_struct *p) | ||
3022 | { | ||
3023 | /* | ||
3024 | * Rough decay (wiping) for cost savings; don't worry ||
3025 | * about the boundary, a really active task won't care ||
3026 | * about the loss. | ||
3027 | */ | ||
3028 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | ||
3029 | current->wakee_flips = 0; | ||
3030 | current->wakee_flip_decay_ts = jiffies; | ||
3031 | } | ||
3032 | |||
3033 | if (current->last_wakee != p) { | ||
3034 | current->last_wakee = p; | ||
3035 | current->wakee_flips++; | ||
3036 | } | ||
3037 | } | ||
3020 | 3038 | ||
3021 | static void task_waking_fair(struct task_struct *p) | 3039 | static void task_waking_fair(struct task_struct *p) |
3022 | { | 3040 | { |
@@ -3037,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p) | |||
3037 | #endif | 3055 | #endif |
3038 | 3056 | ||
3039 | se->vruntime -= min_vruntime; | 3057 | se->vruntime -= min_vruntime; |
3058 | record_wakee(p); | ||
3040 | } | 3059 | } |
3041 | 3060 | ||
3042 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3061 | #ifdef CONFIG_FAIR_GROUP_SCHED |
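The record_wakee() bookkeeping added above is a very cheap, coarse estimate of how often the current task switches wakeup targets: wipe the counter roughly once a second, then count a flip whenever the target changes. A minimal userspace sketch of the same idea, with a plain struct task standing in for task_struct and time(NULL) seconds standing in for jiffies/HZ (the names and the one-second window are illustrative, not from the patch):

#include <stdio.h>
#include <time.h>

struct task {
    const char *name;
    unsigned int wakee_flips;        /* how often the wakeup target changed */
    time_t wakee_flip_decay_ts;      /* last time the counter was wiped */
    const struct task *last_wakee;   /* previous wakeup target */
};

/* Mirror of record_wakee(): wipe the counter roughly once per second,
 * then count a "flip" whenever the wakeup target differs from last time. */
static void record_wakee(struct task *waker, const struct task *wakee)
{
    time_t now = time(NULL);

    if (now > waker->wakee_flip_decay_ts + 1) {
        waker->wakee_flips = 0;
        waker->wakee_flip_decay_ts = now;
    }

    if (waker->last_wakee != wakee) {
        waker->last_wakee = wakee;
        waker->wakee_flips++;
    }
}

int main(void)
{
    struct task server = { "server", 0, 0, NULL };
    struct task a = { "a" }, b = { "b" };

    /* A task that alternates between two wakees accumulates flips fast. */
    for (int i = 0; i < 8; i++)
        record_wakee(&server, (i & 1) ? &a : &b);

    printf("%s wakee_flips = %u\n", server.name, server.wakee_flips);
    return 0;
}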
@@ -3155,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
3155 | 3174 | ||
3156 | #endif | 3175 | #endif |
3157 | 3176 | ||
3177 | static int wake_wide(struct task_struct *p) | ||
3178 | { | ||
3179 | int factor = this_cpu_read(sd_llc_size); | ||
3180 | |||
3181 | /* | ||
3182 | * Yeah, it's the switching frequency; it could mean many wakees or ||
3183 | * rapid switching. Using the factor here just helps to automatically ||
3184 | * adjust the degree of looseness, so a bigger node will lead to more pulling. ||
3185 | */ | ||
3186 | if (p->wakee_flips > factor) { | ||
3187 | /* | ||
3188 | * The wakee is somewhat hot; it needs a certain amount of cpu ||
3189 | * resource, so if the waker is far hotter, prefer to leave ||
3190 | * it alone. | ||
3191 | */ | ||
3192 | if (current->wakee_flips > (factor * p->wakee_flips)) | ||
3193 | return 1; | ||
3194 | } | ||
3195 | |||
3196 | return 0; | ||
3197 | } | ||
3198 | |||
3158 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 3199 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
3159 | { | 3200 | { |
3160 | s64 this_load, load; | 3201 | s64 this_load, load; |
@@ -3164,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
3164 | unsigned long weight; | 3205 | unsigned long weight; |
3165 | int balanced; | 3206 | int balanced; |
3166 | 3207 | ||
3208 | /* | ||
3209 | * If we wake multiple tasks be careful to not bounce | ||
3210 | * ourselves around too much. | ||
3211 | */ | ||
3212 | if (wake_wide(p)) | ||
3213 | return 0; | ||
3214 | |||
3167 | idx = sd->wake_idx; | 3215 | idx = sd->wake_idx; |
3168 | this_cpu = smp_processor_id(); | 3216 | this_cpu = smp_processor_id(); |
3169 | prev_cpu = task_cpu(p); | 3217 | prev_cpu = task_cpu(p); |
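Combined with record_wakee(), the wake_wide() test above tells wake_affine() to give up on an affine wakeup when both the wakee flips targets more often than the LLC size and the waker flips more than LLC-size times as often as the wakee, the typical one-waker/many-wakees server pattern. A standalone sketch of just that comparison; the factor of 4 and the flip counts are invented, not values from the patch:

#include <stdio.h>

/* Simplified view of the wake_wide() test from the patch: spread the
 * wakeup (return 1) only when the wakee itself flips more than the
 * LLC-size factor AND the waker flips more than factor * wakee flips. */
static int wake_wide(unsigned int waker_flips, unsigned int wakee_flips,
                     unsigned int llc_size_factor)
{
    if (wakee_flips > llc_size_factor) {
        if (waker_flips > llc_size_factor * wakee_flips)
            return 1;
    }
    return 0;
}

int main(void)
{
    unsigned int factor = 4;   /* pretend sd_llc_size == 4 */

    /* A "server" waking many clients: waker flips dwarf the wakee's. */
    printf("server->client: wake_wide = %d\n", wake_wide(100, 5, factor));
    /* 1:1 waker/wakee pair: stay affine (wake_wide = 0). */
    printf("1:1 pair:       wake_wide = %d\n", wake_wide(6, 5, factor));
    return 0;
}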
@@ -4171,47 +4219,48 @@ static void update_blocked_averages(int cpu) | |||
4171 | } | 4219 | } |
4172 | 4220 | ||
4173 | /* | 4221 | /* |
4174 | * Compute the cpu's hierarchical load factor for each task group. | 4222 | * Compute the hierarchical load factor for cfs_rq and all its ascendants. |
4175 | * This needs to be done in a top-down fashion because the load of a child | 4223 | * This needs to be done in a top-down fashion because the load of a child |
4176 | * group is a fraction of its parent's load. | 4224 | * group is a fraction of its parent's load. |
4177 | */ | 4225 | */ |
4178 | static int tg_load_down(struct task_group *tg, void *data) | 4226 | static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) |
4179 | { | ||
4180 | unsigned long load; | ||
4181 | long cpu = (long)data; | ||
4182 | |||
4183 | if (!tg->parent) { | ||
4184 | load = cpu_rq(cpu)->avg.load_avg_contrib; | ||
4185 | } else { | ||
4186 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
4187 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, | ||
4188 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); | ||
4189 | } | ||
4190 | |||
4191 | tg->cfs_rq[cpu]->h_load = load; | ||
4192 | |||
4193 | return 0; | ||
4194 | } | ||
4195 | |||
4196 | static void update_h_load(long cpu) | ||
4197 | { | 4227 | { |
4198 | struct rq *rq = cpu_rq(cpu); | 4228 | struct rq *rq = rq_of(cfs_rq); |
4229 | struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; | ||
4199 | unsigned long now = jiffies; | 4230 | unsigned long now = jiffies; |
4231 | unsigned long load; | ||
4200 | 4232 | ||
4201 | if (rq->h_load_throttle == now) | 4233 | if (cfs_rq->last_h_load_update == now) |
4202 | return; | 4234 | return; |
4203 | 4235 | ||
4204 | rq->h_load_throttle = now; | 4236 | cfs_rq->h_load_next = NULL; |
4237 | for_each_sched_entity(se) { | ||
4238 | cfs_rq = cfs_rq_of(se); | ||
4239 | cfs_rq->h_load_next = se; | ||
4240 | if (cfs_rq->last_h_load_update == now) | ||
4241 | break; | ||
4242 | } | ||
4205 | 4243 | ||
4206 | rcu_read_lock(); | 4244 | if (!se) { |
4207 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 4245 | cfs_rq->h_load = rq->avg.load_avg_contrib; |
4208 | rcu_read_unlock(); | 4246 | cfs_rq->last_h_load_update = now; |
4247 | } | ||
4248 | |||
4249 | while ((se = cfs_rq->h_load_next) != NULL) { | ||
4250 | load = cfs_rq->h_load; | ||
4251 | load = div64_ul(load * se->avg.load_avg_contrib, | ||
4252 | cfs_rq->runnable_load_avg + 1); | ||
4253 | cfs_rq = group_cfs_rq(se); | ||
4254 | cfs_rq->h_load = load; | ||
4255 | cfs_rq->last_h_load_update = now; | ||
4256 | } | ||
4209 | } | 4257 | } |
4210 | 4258 | ||
4211 | static unsigned long task_h_load(struct task_struct *p) | 4259 | static unsigned long task_h_load(struct task_struct *p) |
4212 | { | 4260 | { |
4213 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4261 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
4214 | 4262 | ||
4263 | update_cfs_rq_h_load(cfs_rq); | ||
4215 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, | 4264 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
4216 | cfs_rq->runnable_load_avg + 1); | 4265 | cfs_rq->runnable_load_avg + 1); |
4217 | } | 4266 | } |
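The new update_cfs_rq_h_load() walks up the sched_entity parent chain to the last level whose h_load is still fresh, then walks back down; at each level the arithmetic is simply h_load(child) = h_load(parent) * se_load_avg_contrib / (parent_runnable_load_avg + 1). A minimal sketch of that top-down propagation with invented two-level numbers (struct level is a stand-in, not a kernel type):

#include <stdio.h>

/* One level of a task-group hierarchy, top (root) first. */
struct level {
    unsigned long load_avg_contrib;   /* this level's group se contribution */
    unsigned long runnable_load_avg;  /* runnable load of the parent cfs_rq */
};

/* Propagate the hierarchical load top-down, as the while loop in
 * update_cfs_rq_h_load() does once h_load_next has been set up. */
static unsigned long h_load(unsigned long root_load,
                            const struct level *lv, int depth)
{
    unsigned long load = root_load;

    for (int i = 0; i < depth; i++)
        load = load * lv[i].load_avg_contrib /
               (lv[i].runnable_load_avg + 1);
    return load;
}

int main(void)
{
    /* Root rq load 2048; a group contributing half of it; a nested group
     * contributing a quarter of that group. */
    struct level chain[] = {
        { .load_avg_contrib = 1024, .runnable_load_avg = 2047 },
        { .load_avg_contrib =  256, .runnable_load_avg = 1023 },
    };

    printf("h_load at the leaf = %lu\n", h_load(2048, chain, 2));
    return 0;
}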
@@ -4220,10 +4269,6 @@ static inline void update_blocked_averages(int cpu) | |||
4220 | { | 4269 | { |
4221 | } | 4270 | } |
4222 | 4271 | ||
4223 | static inline void update_h_load(long cpu) | ||
4224 | { | ||
4225 | } | ||
4226 | |||
4227 | static unsigned long task_h_load(struct task_struct *p) | 4272 | static unsigned long task_h_load(struct task_struct *p) |
4228 | { | 4273 | { |
4229 | return p->se.avg.load_avg_contrib; | 4274 | return p->se.avg.load_avg_contrib; |
@@ -4232,54 +4277,62 @@ static unsigned long task_h_load(struct task_struct *p) | |||
4232 | 4277 | ||
4233 | /********** Helpers for find_busiest_group ************************/ | 4278 | /********** Helpers for find_busiest_group ************************/ |
4234 | /* | 4279 | /* |
4235 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4236 | * during load balancing. | ||
4237 | */ | ||
4238 | struct sd_lb_stats { | ||
4239 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4240 | struct sched_group *this; /* Local group in this sd */ | ||
4241 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4242 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4243 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4244 | |||
4245 | /** Statistics of this group */ | ||
4246 | unsigned long this_load; | ||
4247 | unsigned long this_load_per_task; | ||
4248 | unsigned long this_nr_running; | ||
4249 | unsigned long this_has_capacity; | ||
4250 | unsigned int this_idle_cpus; | ||
4251 | |||
4252 | /* Statistics of the busiest group */ | ||
4253 | unsigned int busiest_idle_cpus; | ||
4254 | unsigned long max_load; | ||
4255 | unsigned long busiest_load_per_task; | ||
4256 | unsigned long busiest_nr_running; | ||
4257 | unsigned long busiest_group_capacity; | ||
4258 | unsigned long busiest_has_capacity; | ||
4259 | unsigned int busiest_group_weight; | ||
4260 | |||
4261 | int group_imb; /* Is there imbalance in this sd */ | ||
4262 | }; | ||
4263 | |||
4264 | /* | ||
4265 | * sg_lb_stats - stats of a sched_group required for load_balancing | 4280 | * sg_lb_stats - stats of a sched_group required for load_balancing |
4266 | */ | 4281 | */ |
4267 | struct sg_lb_stats { | 4282 | struct sg_lb_stats { |
4268 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 4283 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
4269 | unsigned long group_load; /* Total load over the CPUs of the group */ | 4284 | unsigned long group_load; /* Total load over the CPUs of the group */ |
4270 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
4271 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 4285 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
4272 | unsigned long group_capacity; | 4286 | unsigned long load_per_task; |
4273 | unsigned long idle_cpus; | 4287 | unsigned long group_power; |
4274 | unsigned long group_weight; | 4288 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
4289 | unsigned int group_capacity; | ||
4290 | unsigned int idle_cpus; | ||
4291 | unsigned int group_weight; | ||
4275 | int group_imb; /* Is there an imbalance in the group ? */ | 4292 | int group_imb; /* Is there an imbalance in the group ? */ |
4276 | int group_has_capacity; /* Is there extra capacity in the group? */ | 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ |
4277 | }; | 4294 | }; |
4278 | 4295 | ||
4296 | /* | ||
4297 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4298 | * during load balancing. | ||
4299 | */ | ||
4300 | struct sd_lb_stats { | ||
4301 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4302 | struct sched_group *local; /* Local group in this sd */ | ||
4303 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4304 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4305 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4306 | |||
4307 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | ||
4308 | struct sg_lb_stats local_stat; /* Statistics of the local group */ | ||
4309 | }; | ||
4310 | |||
4311 | static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | ||
4312 | { | ||
4313 | /* | ||
4314 | * Skimp on the clearing to avoid duplicate work. We can avoid clearing | ||
4315 | * local_stat because update_sg_lb_stats() does a full clear/assignment. | ||
4316 | * We must however clear busiest_stat::avg_load because | ||
4317 | * update_sd_pick_busiest() reads this before assignment. | ||
4318 | */ | ||
4319 | *sds = (struct sd_lb_stats){ | ||
4320 | .busiest = NULL, | ||
4321 | .local = NULL, | ||
4322 | .total_load = 0UL, | ||
4323 | .total_pwr = 0UL, | ||
4324 | .busiest_stat = { | ||
4325 | .avg_load = 0UL, | ||
4326 | }, | ||
4327 | }; | ||
4328 | } | ||
4329 | |||
4279 | /** | 4330 | /** |
4280 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
4281 | * @sd: The sched_domain whose load_idx is to be obtained. | 4332 | * @sd: The sched_domain whose load_idx is to be obtained. |
4282 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. | 4333 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. |
4334 | * | ||
4335 | * Return: The load index. | ||
4283 | */ | 4336 | */ |
4284 | static inline int get_sd_load_idx(struct sched_domain *sd, | 4337 | static inline int get_sd_load_idx(struct sched_domain *sd, |
4285 | enum cpu_idle_type idle) | 4338 | enum cpu_idle_type idle) |
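init_sd_lb_stats() above assigns a compound literal that names only a few members; in C, every member left unnamed in such a literal is zero-initialized, so the struct still ends up in a well-defined state. A small standalone demonstration of that idiom, with stand-in structs rather than the scheduler's types:

#include <stdio.h>

/* Stand-in structs; not the scheduler's sg_lb_stats/sd_lb_stats. */
struct gstat { unsigned long avg_load, group_load; };

struct dstat {
    void *busiest, *local;
    unsigned long total_load, total_pwr;
    struct gstat busiest_stat, local_stat;
};

int main(void)
{
    struct dstat sds;

    /* Same shape as init_sd_lb_stats(): name only the members that must
     * hold a particular value; members left unnamed in the compound
     * literal are zero-initialized by the C language rules. */
    sds = (struct dstat){
        .busiest = NULL,
        .local = NULL,
        .total_load = 0UL,
        .total_pwr = 0UL,
        .busiest_stat = { .avg_load = 0UL },
    };

    printf("busiest_stat.avg_load=%lu local_stat.group_load=%lu\n",
           sds.busiest_stat.avg_load, sds.local_stat.group_load);
    return 0;
}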
@@ -4457,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4457 | return 0; | 4510 | return 0; |
4458 | } | 4511 | } |
4459 | 4512 | ||
4513 | /* | ||
4514 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
4515 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
4516 | * | ||
4517 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
4518 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
4519 | * Something like: | ||
4520 | * | ||
4521 | * { 0 1 2 3 } { 4 5 6 7 } | ||
4522 | * * * * * | ||
4523 | * | ||
4524 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
4525 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
4526 | * cpu 3 and leave one of the cpus in the second group unused. | ||
4527 | * | ||
4528 | * The current solution to this issue is detecting the skew in the first group | ||
4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | ||
4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | ||
4531 | * sg_imbalanced(). | ||
4532 | * | ||
4533 | * When this is so detected; this group becomes a candidate for busiest; see | ||
4534 | * update_sd_pick_busiest(). And calculate_imbalance() and ||
4535 | * find_busiest_group() avoid some of the usual balance conditions to allow it ||
4536 | * to create an effective group imbalance. | ||
4537 | * | ||
4538 | * This is a somewhat tricky proposition since the next run might not find the | ||
4539 | * group imbalance and decide the groups need to be balanced again. A most | ||
4540 | * subtle and fragile situation. | ||
4541 | */ | ||
4542 | |||
4543 | struct sg_imb_stats { | ||
4544 | unsigned long max_nr_running, min_nr_running; | ||
4545 | unsigned long max_cpu_load, min_cpu_load; | ||
4546 | }; | ||
4547 | |||
4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
4549 | { | ||
4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | ||
4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
4552 | } | ||
4553 | |||
4554 | static inline void | ||
4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | ||
4556 | unsigned long load, unsigned long nr_running) | ||
4557 | { | ||
4558 | if (load > sgi->max_cpu_load) | ||
4559 | sgi->max_cpu_load = load; | ||
4560 | if (sgi->min_cpu_load > load) | ||
4561 | sgi->min_cpu_load = load; | ||
4562 | |||
4563 | if (nr_running > sgi->max_nr_running) | ||
4564 | sgi->max_nr_running = nr_running; | ||
4565 | if (sgi->min_nr_running > nr_running) | ||
4566 | sgi->min_nr_running = nr_running; | ||
4567 | } | ||
4568 | |||
4569 | static inline int | ||
4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | ||
4571 | { | ||
4572 | /* | ||
4573 | * Consider the group unbalanced when the imbalance is larger | ||
4574 | * than the average weight of a task. | ||
4575 | * | ||
4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4577 | * might not be a suitable number - should we keep a | ||
4578 | * normalized nr_running number somewhere that negates | ||
4579 | * the hierarchy? | ||
4580 | */ | ||
4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
4583 | return 1; | ||
4584 | |||
4585 | return 0; | ||
4586 | } | ||
4587 | |||
4460 | /** | 4588 | /** |
4461 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 4589 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
4462 | * @env: The load balancing environment. | 4590 | * @env: The load balancing environment. |
4463 | * @group: sched_group whose statistics are to be updated. | 4591 | * @group: sched_group whose statistics are to be updated. |
4464 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 4592 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
4465 | * @local_group: Does group contain this_cpu. | 4593 | * @local_group: Does group contain this_cpu. |
4466 | * @balance: Should we balance. | ||
4467 | * @sgs: variable to hold the statistics for this group. | 4594 | * @sgs: variable to hold the statistics for this group. |
4468 | */ | 4595 | */ |
4469 | static inline void update_sg_lb_stats(struct lb_env *env, | 4596 | static inline void update_sg_lb_stats(struct lb_env *env, |
4470 | struct sched_group *group, int load_idx, | 4597 | struct sched_group *group, int load_idx, |
4471 | int local_group, int *balance, struct sg_lb_stats *sgs) | 4598 | int local_group, struct sg_lb_stats *sgs) |
4472 | { | 4599 | { |
4473 | unsigned long nr_running, max_nr_running, min_nr_running; | 4600 | struct sg_imb_stats sgi; |
4474 | unsigned long load, max_cpu_load, min_cpu_load; | 4601 | unsigned long nr_running; |
4475 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 4602 | unsigned long load; |
4476 | unsigned long avg_load_per_task = 0; | ||
4477 | int i; | 4603 | int i; |
4478 | 4604 | ||
4479 | if (local_group) | 4605 | init_sg_imb_stats(&sgi); |
4480 | balance_cpu = group_balance_cpu(group); | ||
4481 | |||
4482 | /* Tally up the load of all CPUs in the group */ | ||
4483 | max_cpu_load = 0; | ||
4484 | min_cpu_load = ~0UL; | ||
4485 | max_nr_running = 0; | ||
4486 | min_nr_running = ~0UL; | ||
4487 | 4606 | ||
4488 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4489 | struct rq *rq = cpu_rq(i); | 4608 | struct rq *rq = cpu_rq(i); |
@@ -4492,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4492 | 4611 | ||
4493 | /* Bias balancing toward cpus of our domain */ | 4612 | /* Bias balancing toward cpus of our domain */ |
4494 | if (local_group) { | 4613 | if (local_group) { |
4495 | if (idle_cpu(i) && !first_idle_cpu && | ||
4496 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
4497 | first_idle_cpu = 1; | ||
4498 | balance_cpu = i; | ||
4499 | } | ||
4500 | |||
4501 | load = target_load(i, load_idx); | 4614 | load = target_load(i, load_idx); |
4502 | } else { | 4615 | } else { |
4503 | load = source_load(i, load_idx); | 4616 | load = source_load(i, load_idx); |
4504 | if (load > max_cpu_load) | 4617 | update_sg_imb_stats(&sgi, load, nr_running); |
4505 | max_cpu_load = load; | ||
4506 | if (min_cpu_load > load) | ||
4507 | min_cpu_load = load; | ||
4508 | |||
4509 | if (nr_running > max_nr_running) | ||
4510 | max_nr_running = nr_running; | ||
4511 | if (min_nr_running > nr_running) | ||
4512 | min_nr_running = nr_running; | ||
4513 | } | 4618 | } |
4514 | 4619 | ||
4515 | sgs->group_load += load; | 4620 | sgs->group_load += load; |
@@ -4519,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4519 | sgs->idle_cpus++; | 4624 | sgs->idle_cpus++; |
4520 | } | 4625 | } |
4521 | 4626 | ||
4522 | /* | 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || |
4523 | * First idle cpu or the first cpu(busiest) in this sched group | 4628 | time_after_eq(jiffies, group->sgp->next_update))) |
4524 | * is eligible for doing load balancing at this and above | 4629 | update_group_power(env->sd, env->dst_cpu); |
4525 | * domains. In the newly idle case, we will allow all the cpu's | ||
4526 | * to do the newly idle load balance. | ||
4527 | */ | ||
4528 | if (local_group) { | ||
4529 | if (env->idle != CPU_NEWLY_IDLE) { | ||
4530 | if (balance_cpu != env->dst_cpu) { | ||
4531 | *balance = 0; | ||
4532 | return; | ||
4533 | } | ||
4534 | update_group_power(env->sd, env->dst_cpu); | ||
4535 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
4536 | update_group_power(env->sd, env->dst_cpu); | ||
4537 | } | ||
4538 | 4630 | ||
4539 | /* Adjust by relative CPU power of the group */ | 4631 | /* Adjust by relative CPU power of the group */ |
4540 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 4632 | sgs->group_power = group->sgp->power; |
4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | ||
4541 | 4634 | ||
4542 | /* | ||
4543 | * Consider the group unbalanced when the imbalance is larger | ||
4544 | * than the average weight of a task. | ||
4545 | * | ||
4546 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4547 | * might not be a suitable number - should we keep a | ||
4548 | * normalized nr_running number somewhere that negates | ||
4549 | * the hierarchy? | ||
4550 | */ | ||
4551 | if (sgs->sum_nr_running) | 4635 | if (sgs->sum_nr_running) |
4552 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
4637 | |||
4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
4553 | 4639 | ||
4554 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | 4640 | sgs->group_capacity = |
4555 | (max_nr_running - min_nr_running) > 1) | 4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); |
4556 | sgs->group_imb = 1; | ||
4557 | 4642 | ||
4558 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | ||
4559 | SCHED_POWER_SCALE); | ||
4560 | if (!sgs->group_capacity) | 4643 | if (!sgs->group_capacity) |
4561 | sgs->group_capacity = fix_small_capacity(env->sd, group); | 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
4645 | |||
4562 | sgs->group_weight = group->group_weight; | 4646 | sgs->group_weight = group->group_weight; |
4563 | 4647 | ||
4564 | if (sgs->group_capacity > sgs->sum_nr_running) | 4648 | if (sgs->group_capacity > sgs->sum_nr_running) |
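Two of the quantities derived in update_sg_lb_stats() above are plain scalings: avg_load normalizes the raw group load by the group's cpu power in SCHED_POWER_SCALE units, and group_capacity rounds the group power to a whole number of SCHED_POWER_SCALE cpus. A worked example with invented totals; DIV_ROUND_CLOSEST is redefined locally as a stand-in for the kernel macro:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
    /* Invented group totals: two full-power cpus worth of power,
     * 3072 units of load, 3 running tasks weighing 3072 in total. */
    unsigned long group_load = 3072, group_power = 2048;
    unsigned long sum_weighted_load = 3072, sum_nr_running = 3;

    unsigned long avg_load = group_load * SCHED_POWER_SCALE / group_power;
    unsigned long load_per_task = sum_weighted_load / sum_nr_running;
    unsigned long group_capacity = DIV_ROUND_CLOSEST(group_power,
                                                     SCHED_POWER_SCALE);

    printf("avg_load=%lu load_per_task=%lu capacity=%lu\n",
           avg_load, load_per_task, group_capacity);   /* 1536 1024 2 */
    return 0;
}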
@@ -4574,13 +4658,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4574 | * | 4658 | * |
4575 | * Determine if @sg is a busier group than the previously selected | 4659 | * Determine if @sg is a busier group than the previously selected |
4576 | * busiest group. | 4660 | * busiest group. |
4661 | * | ||
4662 | * Return: %true if @sg is a busier group than the previously selected | ||
4663 | * busiest group. %false otherwise. | ||
4577 | */ | 4664 | */ |
4578 | static bool update_sd_pick_busiest(struct lb_env *env, | 4665 | static bool update_sd_pick_busiest(struct lb_env *env, |
4579 | struct sd_lb_stats *sds, | 4666 | struct sd_lb_stats *sds, |
4580 | struct sched_group *sg, | 4667 | struct sched_group *sg, |
4581 | struct sg_lb_stats *sgs) | 4668 | struct sg_lb_stats *sgs) |
4582 | { | 4669 | { |
4583 | if (sgs->avg_load <= sds->max_load) | 4670 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
4584 | return false; | 4671 | return false; |
4585 | 4672 | ||
4586 | if (sgs->sum_nr_running > sgs->group_capacity) | 4673 | if (sgs->sum_nr_running > sgs->group_capacity) |
@@ -4613,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4613 | * @sds: variable to hold the statistics for this sched_domain. | 4700 | * @sds: variable to hold the statistics for this sched_domain. |
4614 | */ | 4701 | */ |
4615 | static inline void update_sd_lb_stats(struct lb_env *env, | 4702 | static inline void update_sd_lb_stats(struct lb_env *env, |
4616 | int *balance, struct sd_lb_stats *sds) | 4703 | struct sd_lb_stats *sds) |
4617 | { | 4704 | { |
4618 | struct sched_domain *child = env->sd->child; | 4705 | struct sched_domain *child = env->sd->child; |
4619 | struct sched_group *sg = env->sd->groups; | 4706 | struct sched_group *sg = env->sd->groups; |
4620 | struct sg_lb_stats sgs; | 4707 | struct sg_lb_stats tmp_sgs; |
4621 | int load_idx, prefer_sibling = 0; | 4708 | int load_idx, prefer_sibling = 0; |
4622 | 4709 | ||
4623 | if (child && child->flags & SD_PREFER_SIBLING) | 4710 | if (child && child->flags & SD_PREFER_SIBLING) |
@@ -4626,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4626 | load_idx = get_sd_load_idx(env->sd, env->idle); | 4713 | load_idx = get_sd_load_idx(env->sd, env->idle); |
4627 | 4714 | ||
4628 | do { | 4715 | do { |
4716 | struct sg_lb_stats *sgs = &tmp_sgs; | ||
4629 | int local_group; | 4717 | int local_group; |
4630 | 4718 | ||
4631 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 4719 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
4632 | memset(&sgs, 0, sizeof(sgs)); | 4720 | if (local_group) { |
4633 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 4721 | sds->local = sg; |
4634 | 4722 | sgs = &sds->local_stat; | |
4635 | if (local_group && !(*balance)) | 4723 | } |
4636 | return; | ||
4637 | 4724 | ||
4638 | sds->total_load += sgs.group_load; | 4725 | memset(sgs, 0, sizeof(*sgs)); |
4639 | sds->total_pwr += sg->sgp->power; | 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
4640 | 4727 | ||
4641 | /* | 4728 | /* |
4642 | * In case the child domain prefers tasks go to siblings | 4729 | * In case the child domain prefers tasks go to siblings |
@@ -4648,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4648 | * heaviest group when it is already under-utilized (possible | 4735 | * heaviest group when it is already under-utilized (possible |
4649 | * with a large weight task outweighs the tasks on the system). | 4736 | * with a large weight task outweighs the tasks on the system). |
4650 | */ | 4737 | */ |
4651 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 4738 | if (prefer_sibling && !local_group && |
4652 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 4739 | sds->local && sds->local_stat.group_has_capacity) |
4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | ||
4653 | 4741 | ||
4654 | if (local_group) { | 4742 | /* Now, start updating sd_lb_stats */ |
4655 | sds->this_load = sgs.avg_load; | 4743 | sds->total_load += sgs->group_load; |
4656 | sds->this = sg; | 4744 | sds->total_pwr += sgs->group_power; |
4657 | sds->this_nr_running = sgs.sum_nr_running; | 4745 | |
4658 | sds->this_load_per_task = sgs.sum_weighted_load; | 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { |
4659 | sds->this_has_capacity = sgs.group_has_capacity; | ||
4660 | sds->this_idle_cpus = sgs.idle_cpus; | ||
4661 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | ||
4662 | sds->max_load = sgs.avg_load; | ||
4663 | sds->busiest = sg; | 4747 | sds->busiest = sg; |
4664 | sds->busiest_nr_running = sgs.sum_nr_running; | 4748 | sds->busiest_stat = *sgs; |
4665 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
4666 | sds->busiest_group_capacity = sgs.group_capacity; | ||
4667 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
4668 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
4669 | sds->busiest_group_weight = sgs.group_weight; | ||
4670 | sds->group_imb = sgs.group_imb; | ||
4671 | } | 4749 | } |
4672 | 4750 | ||
4673 | sg = sg->next; | 4751 | sg = sg->next; |
@@ -4691,7 +4769,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4691 | * assuming lower CPU number will be equivalent to lower a SMT thread | 4769 | * assuming lower CPU number will be equivalent to lower a SMT thread |
4692 | * number. | 4770 | * number. |
4693 | * | 4771 | * |
4694 | * Returns 1 when packing is required and a task should be moved to | 4772 | * Return: 1 when packing is required and a task should be moved to |
4695 | * this CPU. The amount of the imbalance is returned in *imbalance. | 4773 | * this CPU. The amount of the imbalance is returned in *imbalance. |
4696 | * | 4774 | * |
4697 | * @env: The load balancing environment. | 4775 | * @env: The load balancing environment. |
@@ -4712,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
4712 | return 0; | 4790 | return 0; |
4713 | 4791 | ||
4714 | env->imbalance = DIV_ROUND_CLOSEST( | 4792 | env->imbalance = DIV_ROUND_CLOSEST( |
4715 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 4793 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, |
4794 | SCHED_POWER_SCALE); | ||
4716 | 4795 | ||
4717 | return 1; | 4796 | return 1; |
4718 | } | 4797 | } |
@@ -4730,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4730 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 4809 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4731 | unsigned int imbn = 2; | 4810 | unsigned int imbn = 2; |
4732 | unsigned long scaled_busy_load_per_task; | 4811 | unsigned long scaled_busy_load_per_task; |
4812 | struct sg_lb_stats *local, *busiest; | ||
4733 | 4813 | ||
4734 | if (sds->this_nr_running) { | 4814 | local = &sds->local_stat; |
4735 | sds->this_load_per_task /= sds->this_nr_running; | 4815 | busiest = &sds->busiest_stat; |
4736 | if (sds->busiest_load_per_task > | 4816 | |
4737 | sds->this_load_per_task) | 4817 | if (!local->sum_nr_running) |
4738 | imbn = 1; | 4818 | local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); |
4739 | } else { | 4819 | else if (busiest->load_per_task > local->load_per_task) |
4740 | sds->this_load_per_task = | 4820 | imbn = 1; |
4741 | cpu_avg_load_per_task(env->dst_cpu); | ||
4742 | } | ||
4743 | 4821 | ||
4744 | scaled_busy_load_per_task = sds->busiest_load_per_task | 4822 | scaled_busy_load_per_task = |
4745 | * SCHED_POWER_SCALE; | 4823 | (busiest->load_per_task * SCHED_POWER_SCALE) / |
4746 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 4824 | busiest->group_power; |
4747 | 4825 | ||
4748 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 4826 | if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= |
4749 | (scaled_busy_load_per_task * imbn)) { | 4827 | (scaled_busy_load_per_task * imbn)) { |
4750 | env->imbalance = sds->busiest_load_per_task; | 4828 | env->imbalance = busiest->load_per_task; |
4751 | return; | 4829 | return; |
4752 | } | 4830 | } |
4753 | 4831 | ||
@@ -4757,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4757 | * moving them. | 4835 | * moving them. |
4758 | */ | 4836 | */ |
4759 | 4837 | ||
4760 | pwr_now += sds->busiest->sgp->power * | 4838 | pwr_now += busiest->group_power * |
4761 | min(sds->busiest_load_per_task, sds->max_load); | 4839 | min(busiest->load_per_task, busiest->avg_load); |
4762 | pwr_now += sds->this->sgp->power * | 4840 | pwr_now += local->group_power * |
4763 | min(sds->this_load_per_task, sds->this_load); | 4841 | min(local->load_per_task, local->avg_load); |
4764 | pwr_now /= SCHED_POWER_SCALE; | 4842 | pwr_now /= SCHED_POWER_SCALE; |
4765 | 4843 | ||
4766 | /* Amount of load we'd subtract */ | 4844 | /* Amount of load we'd subtract */ |
4767 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4845 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4768 | sds->busiest->sgp->power; | 4846 | busiest->group_power; |
4769 | if (sds->max_load > tmp) | 4847 | if (busiest->avg_load > tmp) { |
4770 | pwr_move += sds->busiest->sgp->power * | 4848 | pwr_move += busiest->group_power * |
4771 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4849 | min(busiest->load_per_task, |
4850 | busiest->avg_load - tmp); | ||
4851 | } | ||
4772 | 4852 | ||
4773 | /* Amount of load we'd add */ | 4853 | /* Amount of load we'd add */ |
4774 | if (sds->max_load * sds->busiest->sgp->power < | 4854 | if (busiest->avg_load * busiest->group_power < |
4775 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 4855 | busiest->load_per_task * SCHED_POWER_SCALE) { |
4776 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 4856 | tmp = (busiest->avg_load * busiest->group_power) / |
4777 | sds->this->sgp->power; | 4857 | local->group_power; |
4778 | else | 4858 | } else { |
4779 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4859 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4780 | sds->this->sgp->power; | 4860 | local->group_power; |
4781 | pwr_move += sds->this->sgp->power * | 4861 | } |
4782 | min(sds->this_load_per_task, sds->this_load + tmp); | 4862 | pwr_move += local->group_power * |
4863 | min(local->load_per_task, local->avg_load + tmp); | ||
4783 | pwr_move /= SCHED_POWER_SCALE; | 4864 | pwr_move /= SCHED_POWER_SCALE; |
4784 | 4865 | ||
4785 | /* Move if we gain throughput */ | 4866 | /* Move if we gain throughput */ |
4786 | if (pwr_move > pwr_now) | 4867 | if (pwr_move > pwr_now) |
4787 | env->imbalance = sds->busiest_load_per_task; | 4868 | env->imbalance = busiest->load_per_task; |
4788 | } | 4869 | } |
4789 | 4870 | ||
4790 | /** | 4871 | /** |
@@ -4796,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4796 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 4877 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4797 | { | 4878 | { |
4798 | unsigned long max_pull, load_above_capacity = ~0UL; | 4879 | unsigned long max_pull, load_above_capacity = ~0UL; |
4880 | struct sg_lb_stats *local, *busiest; | ||
4799 | 4881 | ||
4800 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 4882 | local = &sds->local_stat; |
4801 | if (sds->group_imb) { | 4883 | busiest = &sds->busiest_stat; |
4802 | sds->busiest_load_per_task = | 4884 | |
4803 | min(sds->busiest_load_per_task, sds->avg_load); | 4885 | if (busiest->group_imb) { |
4886 | /* | ||
4887 | * In the group_imb case we cannot rely on group-wide averages | ||
4888 | * to ensure cpu-load equilibrium, look at wider averages. XXX | ||
4889 | */ | ||
4890 | busiest->load_per_task = | ||
4891 | min(busiest->load_per_task, sds->avg_load); | ||
4804 | } | 4892 | } |
4805 | 4893 | ||
4806 | /* | 4894 | /* |
@@ -4808,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4808 | * max load less than avg load (as we skip the groups at or below | 4896 | * max load less than avg load (as we skip the groups at or below |
4809 | * its cpu_power, while calculating max_load..) | 4897 | * its cpu_power, while calculating max_load..) |
4810 | */ | 4898 | */ |
4811 | if (sds->max_load < sds->avg_load) { | 4899 | if (busiest->avg_load < sds->avg_load) { |
4812 | env->imbalance = 0; | 4900 | env->imbalance = 0; |
4813 | return fix_small_imbalance(env, sds); | 4901 | return fix_small_imbalance(env, sds); |
4814 | } | 4902 | } |
4815 | 4903 | ||
4816 | if (!sds->group_imb) { | 4904 | if (!busiest->group_imb) { |
4817 | /* | 4905 | /* |
4818 | * Don't want to pull so many tasks that a group would go idle. | 4906 | * Don't want to pull so many tasks that a group would go idle. |
4907 | * Except of course for the group_imb case, since then we might | ||
4908 | * have to drop below capacity to reach cpu-load equilibrium. | ||
4819 | */ | 4909 | */ |
4820 | load_above_capacity = (sds->busiest_nr_running - | 4910 | load_above_capacity = |
4821 | sds->busiest_group_capacity); | 4911 | (busiest->sum_nr_running - busiest->group_capacity); |
4822 | 4912 | ||
4823 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 4913 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
4824 | 4914 | load_above_capacity /= busiest->group_power; | |
4825 | load_above_capacity /= sds->busiest->sgp->power; | ||
4826 | } | 4915 | } |
4827 | 4916 | ||
4828 | /* | 4917 | /* |
@@ -4832,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4832 | * we also don't want to reduce the group load below the group capacity | 4921 | * we also don't want to reduce the group load below the group capacity |
4833 | * (so that we can implement power-savings policies etc). Thus we look | 4922 | * (so that we can implement power-savings policies etc). Thus we look |
4834 | * for the minimum possible imbalance. | 4923 | * for the minimum possible imbalance. |
4835 | * Be careful of negative numbers as they'll appear as very large values | ||
4836 | * with unsigned longs. | ||
4837 | */ | 4924 | */ |
4838 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4925 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
4839 | 4926 | ||
4840 | /* How much load to actually move to equalise the imbalance */ | 4927 | /* How much load to actually move to equalise the imbalance */ |
4841 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 4928 | env->imbalance = min( |
4842 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4929 | max_pull * busiest->group_power, |
4843 | / SCHED_POWER_SCALE; | 4930 | (sds->avg_load - local->avg_load) * local->group_power |
4931 | ) / SCHED_POWER_SCALE; | ||
4844 | 4932 | ||
4845 | /* | 4933 | /* |
4846 | * if *imbalance is less than the average load per runnable task | 4934 | * if *imbalance is less than the average load per runnable task |
@@ -4848,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4848 | * a think about bumping its value to force at least one task to be | 4936 | * a think about bumping its value to force at least one task to be |
4849 | * moved | 4937 | * moved |
4850 | */ | 4938 | */ |
4851 | if (env->imbalance < sds->busiest_load_per_task) | 4939 | if (env->imbalance < busiest->load_per_task) |
4852 | return fix_small_imbalance(env, sds); | 4940 | return fix_small_imbalance(env, sds); |
4853 | |||
4854 | } | 4941 | } |
4855 | 4942 | ||
4856 | /******* find_busiest_group() helpers end here *********************/ | 4943 | /******* find_busiest_group() helpers end here *********************/ |
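The env->imbalance computed above is the smaller of two quantities, both scaled back out of SCHED_POWER_SCALE units: how far the busiest group sits above the domain average (capped by load_above_capacity) and how far the local group sits below it. A worked example with invented loads and powers; min_ul() is a local helper, not a kernel function:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    /* Invented numbers: scaled per-group average loads and group powers. */
    unsigned long busiest_avg_load = 3000, busiest_power = 2048;
    unsigned long local_avg_load   = 1000, local_power   = 1024;
    unsigned long domain_avg_load  = 2000;
    unsigned long load_above_capacity = ~0UL;   /* not limiting here */

    /* max_pull: how far the busiest group is above the domain average,
     * capped so we never pull a group below its own capacity. */
    unsigned long max_pull = min_ul(busiest_avg_load - domain_avg_load,
                                    load_above_capacity);

    /* Same shape as calculate_imbalance(): take the smaller of "what the
     * busiest group can give" and "what the local group can absorb". */
    unsigned long imbalance = min_ul(max_pull * busiest_power,
                                     (domain_avg_load - local_avg_load) *
                                     local_power) / SCHED_POWER_SCALE;

    printf("imbalance = %lu\n", imbalance);   /* min(1000*2048, 1000*1024)/1024 = 1000 */
    return 0;
}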
@@ -4866,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4866 | * to restore balance. | 4953 | * to restore balance. |
4867 | * | 4954 | * |
4868 | * @env: The load balancing environment. | 4955 | * @env: The load balancing environment. |
4869 | * @balance: Pointer to a variable indicating if this_cpu | ||
4870 | * is the appropriate cpu to perform load balancing at this_level. | ||
4871 | * | 4956 | * |
4872 | * Returns: - the busiest group if imbalance exists. | 4957 | * Return: - The busiest group if imbalance exists. |
4873 | * - If no imbalance and user has opted for power-savings balance, | 4958 | * - If no imbalance and user has opted for power-savings balance, |
4874 | * return the least loaded group whose CPUs can be | 4959 | * return the least loaded group whose CPUs can be |
4875 | * put to idle by rebalancing its tasks onto our group. | 4960 | * put to idle by rebalancing its tasks onto our group. |
4876 | */ | 4961 | */ |
4877 | static struct sched_group * | 4962 | static struct sched_group *find_busiest_group(struct lb_env *env) |
4878 | find_busiest_group(struct lb_env *env, int *balance) | ||
4879 | { | 4963 | { |
4964 | struct sg_lb_stats *local, *busiest; | ||
4880 | struct sd_lb_stats sds; | 4965 | struct sd_lb_stats sds; |
4881 | 4966 | ||
4882 | memset(&sds, 0, sizeof(sds)); | 4967 | init_sd_lb_stats(&sds); |
4883 | 4968 | ||
4884 | /* | 4969 | /* |
4885 | * Compute the various statistics relevant for load balancing at | 4970 | * Compute the various statistics relevant for load balancing at |
4886 | * this level. | 4971 | * this level. |
4887 | */ | 4972 | */ |
4888 | update_sd_lb_stats(env, balance, &sds); | 4973 | update_sd_lb_stats(env, &sds); |
4889 | 4974 | local = &sds.local_stat; | |
4890 | /* | 4975 | busiest = &sds.busiest_stat; |
4891 | * this_cpu is not the appropriate cpu to perform load balancing at | ||
4892 | * this level. | ||
4893 | */ | ||
4894 | if (!(*balance)) | ||
4895 | goto ret; | ||
4896 | 4976 | ||
4897 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 4977 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4898 | check_asym_packing(env, &sds)) | 4978 | check_asym_packing(env, &sds)) |
4899 | return sds.busiest; | 4979 | return sds.busiest; |
4900 | 4980 | ||
4901 | /* There is no busy sibling group to pull tasks from */ | 4981 | /* There is no busy sibling group to pull tasks from */ |
4902 | if (!sds.busiest || sds.busiest_nr_running == 0) | 4982 | if (!sds.busiest || busiest->sum_nr_running == 0) |
4903 | goto out_balanced; | 4983 | goto out_balanced; |
4904 | 4984 | ||
4905 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 4985 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
4906 | 4986 | ||
4907 | /* | 4987 | /* |
4908 | * If the busiest group is imbalanced the below checks don't | 4988 | * If the busiest group is imbalanced the below checks don't |
4909 | * work because they assumes all things are equal, which typically | 4989 | * work because they assume all things are equal, which typically |
4910 | * isn't true due to cpus_allowed constraints and the like. | 4990 | * isn't true due to cpus_allowed constraints and the like. |
4911 | */ | 4991 | */ |
4912 | if (sds.group_imb) | 4992 | if (busiest->group_imb) |
4913 | goto force_balance; | 4993 | goto force_balance; |
4914 | 4994 | ||
4915 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4995 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4916 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4996 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && |
4917 | !sds.busiest_has_capacity) | 4997 | !busiest->group_has_capacity) |
4918 | goto force_balance; | 4998 | goto force_balance; |
4919 | 4999 | ||
4920 | /* | 5000 | /* |
4921 | * If the local group is more busy than the selected busiest group | 5001 | * If the local group is more busy than the selected busiest group |
4922 | * don't try and pull any tasks. | 5002 | * don't try and pull any tasks. |
4923 | */ | 5003 | */ |
4924 | if (sds.this_load >= sds.max_load) | 5004 | if (local->avg_load >= busiest->avg_load) |
4925 | goto out_balanced; | 5005 | goto out_balanced; |
4926 | 5006 | ||
4927 | /* | 5007 | /* |
4928 | * Don't pull any tasks if this group is already above the domain | 5008 | * Don't pull any tasks if this group is already above the domain |
4929 | * average load. | 5009 | * average load. |
4930 | */ | 5010 | */ |
4931 | if (sds.this_load >= sds.avg_load) | 5011 | if (local->avg_load >= sds.avg_load) |
4932 | goto out_balanced; | 5012 | goto out_balanced; |
4933 | 5013 | ||
4934 | if (env->idle == CPU_IDLE) { | 5014 | if (env->idle == CPU_IDLE) { |
@@ -4938,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance) | |||
4938 | * there is no imbalance between this and busiest group | 5018 | * there is no imbalance between this and busiest group |
4939 | * wrt to idle cpu's, it is balanced. | 5019 | * wrt to idle cpu's, it is balanced. |
4940 | */ | 5020 | */ |
4941 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 5021 | if ((local->idle_cpus < busiest->idle_cpus) && |
4942 | sds.busiest_nr_running <= sds.busiest_group_weight) | 5022 | busiest->sum_nr_running <= busiest->group_weight) |
4943 | goto out_balanced; | 5023 | goto out_balanced; |
4944 | } else { | 5024 | } else { |
4945 | /* | 5025 | /* |
4946 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 5026 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4947 | * imbalance_pct to be conservative. | 5027 | * imbalance_pct to be conservative. |
4948 | */ | 5028 | */ |
4949 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 5029 | if (100 * busiest->avg_load <= |
5030 | env->sd->imbalance_pct * local->avg_load) | ||
4950 | goto out_balanced; | 5031 | goto out_balanced; |
4951 | } | 5032 | } |
4952 | 5033 | ||
@@ -4956,7 +5037,6 @@ force_balance: | |||
4956 | return sds.busiest; | 5037 | return sds.busiest; |
4957 | 5038 | ||
4958 | out_balanced: | 5039 | out_balanced: |
4959 | ret: | ||
4960 | env->imbalance = 0; | 5040 | env->imbalance = 0; |
4961 | return NULL; | 5041 | return NULL; |
4962 | } | 5042 | } |
@@ -4968,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4968 | struct sched_group *group) | 5048 | struct sched_group *group) |
4969 | { | 5049 | { |
4970 | struct rq *busiest = NULL, *rq; | 5050 | struct rq *busiest = NULL, *rq; |
4971 | unsigned long max_load = 0; | 5051 | unsigned long busiest_load = 0, busiest_power = 1; |
4972 | int i; | 5052 | int i; |
4973 | 5053 | ||
4974 | for_each_cpu(i, sched_group_cpus(group)) { | 5054 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4975 | unsigned long power = power_of(i); | 5055 | unsigned long power = power_of(i); |
4976 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5056 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
4977 | SCHED_POWER_SCALE); | 5057 | SCHED_POWER_SCALE); |
@@ -4980,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4980 | if (!capacity) | 5060 | if (!capacity) |
4981 | capacity = fix_small_capacity(env->sd, group); | 5061 | capacity = fix_small_capacity(env->sd, group); |
4982 | 5062 | ||
4983 | if (!cpumask_test_cpu(i, env->cpus)) | ||
4984 | continue; | ||
4985 | |||
4986 | rq = cpu_rq(i); | 5063 | rq = cpu_rq(i); |
4987 | wl = weighted_cpuload(i); | 5064 | wl = weighted_cpuload(i); |
4988 | 5065 | ||
@@ -4998,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4998 | * the weighted_cpuload() scaled with the cpu power, so that | 5075 | * the weighted_cpuload() scaled with the cpu power, so that |
4999 | * the load can be moved away from the cpu that is potentially | 5076 | * the load can be moved away from the cpu that is potentially |
5000 | * running at a lower capacity. | 5077 | * running at a lower capacity. |
5078 | * | ||
5079 | * Thus we're looking for max(wl_i / power_i), crosswise | ||
5080 | * multiplication to rid ourselves of the division works out | ||
5081 | * to: wl_i * power_j > wl_j * power_i; where j is our | ||
5082 | * previous maximum. | ||
5001 | */ | 5083 | */ |
5002 | wl = (wl * SCHED_POWER_SCALE) / power; | 5084 | if (wl * busiest_power > busiest_load * power) { |
5003 | 5085 | busiest_load = wl; | |
5004 | if (wl > max_load) { | 5086 | busiest_power = power; |
5005 | max_load = wl; | ||
5006 | busiest = rq; | 5087 | busiest = rq; |
5007 | } | 5088 | } |
5008 | } | 5089 | } |
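The rewritten find_busiest_queue() loop avoids dividing wl by the cpu power on every iteration: for positive powers, wl_i / power_i > wl_j / power_j holds exactly when wl_i * power_j > wl_j * power_i, and the cross-multiplied form also keeps full integer precision. A standalone sketch of the same selection over invented per-cpu loads and powers:

#include <stdio.h>

int main(void)
{
    /* Invented weighted loads and cpu powers for four cpus. */
    unsigned long wl[]    = { 900, 2048, 1500, 1200 };
    unsigned long power[] = { 1024, 2048, 1024,  512 };
    int n = 4;

    unsigned long busiest_load = 0, busiest_power = 1;
    int busiest = -1;

    for (int i = 0; i < n; i++) {
        /* Looking for max(wl_i / power_i) without dividing:
         * wl_i * power_j > wl_j * power_i, j being the current maximum. */
        if (wl[i] * busiest_power > busiest_load * power[i]) {
            busiest_load = wl[i];
            busiest_power = power[i];
            busiest = i;
        }
    }

    /* cpu 3 wins: 1200/512 is the largest load-per-power ratio. */
    printf("busiest cpu = %d (wl=%lu, power=%lu)\n",
           busiest, busiest_load, busiest_power);
    return 0;
}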
@@ -5039,13 +5120,47 @@ static int need_active_balance(struct lb_env *env) | |||
5039 | 5120 | ||
5040 | static int active_load_balance_cpu_stop(void *data); | 5121 | static int active_load_balance_cpu_stop(void *data); |
5041 | 5122 | ||
5123 | static int should_we_balance(struct lb_env *env) | ||
5124 | { | ||
5125 | struct sched_group *sg = env->sd->groups; | ||
5126 | struct cpumask *sg_cpus, *sg_mask; | ||
5127 | int cpu, balance_cpu = -1; | ||
5128 | |||
5129 | /* | ||
5130 | * In the newly idle case, we will allow all the cpu's | ||
5131 | * to do the newly idle load balance. | ||
5132 | */ | ||
5133 | if (env->idle == CPU_NEWLY_IDLE) | ||
5134 | return 1; | ||
5135 | |||
5136 | sg_cpus = sched_group_cpus(sg); | ||
5137 | sg_mask = sched_group_mask(sg); | ||
5138 | /* Try to find first idle cpu */ | ||
5139 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | ||
5140 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | ||
5141 | continue; | ||
5142 | |||
5143 | balance_cpu = cpu; | ||
5144 | break; | ||
5145 | } | ||
5146 | |||
5147 | if (balance_cpu == -1) | ||
5148 | balance_cpu = group_balance_cpu(sg); | ||
5149 | |||
5150 | /* | ||
5151 | * First idle cpu or the first cpu(busiest) in this sched group | ||
5152 | * is eligible for doing load balancing at this and above domains. | ||
5153 | */ | ||
5154 | return balance_cpu == env->dst_cpu; ||
5155 | } | ||
5156 | |||
5042 | /* | 5157 | /* |
5043 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 5158 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
5044 | * tasks if there is an imbalance. | 5159 | * tasks if there is an imbalance. |
5045 | */ | 5160 | */ |
5046 | static int load_balance(int this_cpu, struct rq *this_rq, | 5161 | static int load_balance(int this_cpu, struct rq *this_rq, |
5047 | struct sched_domain *sd, enum cpu_idle_type idle, | 5162 | struct sched_domain *sd, enum cpu_idle_type idle, |
5048 | int *balance) | 5163 | int *continue_balancing) |
5049 | { | 5164 | { |
5050 | int ld_moved, cur_ld_moved, active_balance = 0; | 5165 | int ld_moved, cur_ld_moved, active_balance = 0; |
5051 | struct sched_group *group; | 5166 | struct sched_group *group; |
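With should_we_balance() factored out above, only one cpu per group proceeds with balancing at a given level outside the newly-idle case: the first idle cpu in the group, or the group's designated balance cpu if none are idle. A standalone sketch of that selection using plain arrays instead of cpumasks; the cpu numbers and idle states are made up:

#include <stdio.h>
#include <stdbool.h>

/* Pick the cpu allowed to balance for a group: the first idle cpu, or the
 * group's default balance cpu when none are idle; this mirrors the policy
 * of should_we_balance(), minus cpumasks and the NEWLY_IDLE special case. */
static int pick_balance_cpu(const int *cpus, const bool *idle, int n,
                            int group_balance_cpu)
{
    for (int i = 0; i < n; i++) {
        if (idle[i])
            return cpus[i];
    }
    return group_balance_cpu;
}

int main(void)
{
    int cpus[]  = { 4, 5, 6, 7 };
    bool idle[] = { false, false, true, true };
    int balance_cpu = pick_balance_cpu(cpus, idle, 4, cpus[0]);

    /* Only the chosen cpu keeps balancing; every other cpu bails out,
     * matching the balance_cpu == env->dst_cpu check. */
    for (int dst = 4; dst <= 7; dst++)
        printf("cpu %d should balance: %s\n",
               dst, dst == balance_cpu ? "yes" : "no");
    return 0;
}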
@@ -5075,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5075 | schedstat_inc(sd, lb_count[idle]); | 5190 | schedstat_inc(sd, lb_count[idle]); |
5076 | 5191 | ||
5077 | redo: | 5192 | redo: |
5078 | group = find_busiest_group(&env, balance); | 5193 | if (!should_we_balance(&env)) { |
5079 | 5194 | *continue_balancing = 0; | |
5080 | if (*balance == 0) | ||
5081 | goto out_balanced; | 5195 | goto out_balanced; |
5196 | } | ||
5082 | 5197 | ||
5198 | group = find_busiest_group(&env); | ||
5083 | if (!group) { | 5199 | if (!group) { |
5084 | schedstat_inc(sd, lb_nobusyg[idle]); | 5200 | schedstat_inc(sd, lb_nobusyg[idle]); |
5085 | goto out_balanced; | 5201 | goto out_balanced; |
@@ -5108,7 +5224,6 @@ redo: | |||
5108 | env.src_rq = busiest; | 5224 | env.src_rq = busiest; |
5109 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 5225 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
5110 | 5226 | ||
5111 | update_h_load(env.src_cpu); | ||
5112 | more_balance: | 5227 | more_balance: |
5113 | local_irq_save(flags); | 5228 | local_irq_save(flags); |
5114 | double_rq_lock(env.dst_rq, busiest); | 5229 | double_rq_lock(env.dst_rq, busiest); |
@@ -5292,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5292 | rcu_read_lock(); | 5407 | rcu_read_lock(); |
5293 | for_each_domain(this_cpu, sd) { | 5408 | for_each_domain(this_cpu, sd) { |
5294 | unsigned long interval; | 5409 | unsigned long interval; |
5295 | int balance = 1; | 5410 | int continue_balancing = 1; |
5296 | 5411 | ||
5297 | if (!(sd->flags & SD_LOAD_BALANCE)) | 5412 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5298 | continue; | 5413 | continue; |
@@ -5300,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5300 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 5415 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
5301 | /* If we've pulled tasks over stop searching: */ | 5416 | /* If we've pulled tasks over stop searching: */ |
5302 | pulled_task = load_balance(this_cpu, this_rq, | 5417 | pulled_task = load_balance(this_cpu, this_rq, |
5303 | sd, CPU_NEWLY_IDLE, &balance); | 5418 | sd, CPU_NEWLY_IDLE, |
5419 | &continue_balancing); | ||
5304 | } | 5420 | } |
5305 | 5421 | ||
5306 | interval = msecs_to_jiffies(sd->balance_interval); | 5422 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -5506,7 +5622,7 @@ void nohz_balance_enter_idle(int cpu) | |||
5506 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 5622 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
5507 | } | 5623 | } |
5508 | 5624 | ||
5509 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 5625 | static int sched_ilb_notifier(struct notifier_block *nfb, |
5510 | unsigned long action, void *hcpu) | 5626 | unsigned long action, void *hcpu) |
5511 | { | 5627 | { |
5512 | switch (action & ~CPU_TASKS_FROZEN) { | 5628 | switch (action & ~CPU_TASKS_FROZEN) { |
@@ -5538,7 +5654,7 @@ void update_max_interval(void) | |||
5538 | */ | 5654 | */ |
5539 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5540 | { | 5656 | { |
5541 | int balance = 1; | 5657 | int continue_balancing = 1; |
5542 | struct rq *rq = cpu_rq(cpu); | 5658 | struct rq *rq = cpu_rq(cpu); |
5543 | unsigned long interval; | 5659 | unsigned long interval; |
5544 | struct sched_domain *sd; | 5660 | struct sched_domain *sd; |
@@ -5570,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5570 | } | 5686 | } |
5571 | 5687 | ||
5572 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5688 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5573 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5689 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
5574 | /* | 5690 | /* |
5575 | * The LBF_SOME_PINNED logic could have changed | 5691 | * The LBF_SOME_PINNED logic could have changed |
5576 | * env->dst_cpu, so we can't know our idle | 5692 | * env->dst_cpu, so we can't know our idle |
@@ -5593,7 +5709,7 @@ out: | |||
5593 | * CPU in our sched group which is doing load balancing more | 5709 | * CPU in our sched group which is doing load balancing more |
5594 | * actively. | 5710 | * actively. |
5595 | */ | 5711 | */ |
5596 | if (!balance) | 5712 | if (!continue_balancing) |
5597 | break; | 5713 | break; |
5598 | } | 5714 | } |
5599 | rcu_read_unlock(); | 5715 | rcu_read_unlock(); |
@@ -5786,7 +5902,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
5786 | entity_tick(cfs_rq, se, queued); | 5902 | entity_tick(cfs_rq, se, queued); |
5787 | } | 5903 | } |
5788 | 5904 | ||
5789 | if (sched_feat_numa(NUMA)) | 5905 | if (numabalancing_enabled) |
5790 | task_tick_numa(rq, curr); | 5906 | task_tick_numa(rq, curr); |
5791 | 5907 | ||
5792 | update_rq_runnable_avg(rq, 1); | 5908 | update_rq_runnable_avg(rq, 1); |
@@ -5889,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5889 | * and ensure we don't carry in an old decay_count if we | 6005 | * and ensure we don't carry in an old decay_count if we |
5890 | * switch back. | 6006 | * switch back. |
5891 | */ | 6007 | */ |
5892 | if (p->se.avg.decay_count) { | 6008 | if (se->avg.decay_count) { |
5893 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 6009 | __synchronize_entity_decay(se); |
5894 | __synchronize_entity_decay(&p->se); | 6010 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); |
5895 | subtract_blocked_load_contrib(cfs_rq, | ||
5896 | p->se.avg.load_avg_contrib); | ||
5897 | } | 6011 | } |
5898 | #endif | 6012 | #endif |
5899 | } | 6013 | } |