Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--  kernel/sched/fair.c  632
 1 file changed, 373 insertions(+), 259 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c527449..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated)
 {
 	struct task_struct *p = current;
 
-	if (!sched_feat_numa(NUMA))
+	if (!numabalancing_enabled)
 		return;
 
 	/* FIXME: Allocate task-specific structure for placement policy here */
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_entity_load_avg(curr, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
+	update_cfs_shares(cfs_rq);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -3017,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return 0;
 }
 
+static void record_wakee(struct task_struct *p)
+{
+	/*
+	 * Rough decay (wiping) for cost saving, don't worry
+	 * about the boundary, really active task won't care
+	 * about the loss.
+	 */
+	if (jiffies > current->wakee_flip_decay_ts + HZ) {
+		current->wakee_flips = 0;
+		current->wakee_flip_decay_ts = jiffies;
+	}
+
+	if (current->last_wakee != p) {
+		current->last_wakee = p;
+		current->wakee_flips++;
+	}
+}
 
 static void task_waking_fair(struct task_struct *p)
 {
@@ -3037,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
 #endif
 
 	se->vruntime -= min_vruntime;
+	record_wakee(p);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3155,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
+static int wake_wide(struct task_struct *p)
+{
+	int factor = this_cpu_read(sd_llc_size);
+
+	/*
+	 * Yeah, it's the switching-frequency, could means many wakee or
+	 * rapidly switch, use factor here will just help to automatically
+	 * adjust the loose-degree, so bigger node will lead to more pull.
+	 */
+	if (p->wakee_flips > factor) {
+		/*
+		 * wakee is somewhat hot, it needs certain amount of cpu
+		 * resource, so if waker is far more hot, prefer to leave
+		 * it alone.
+		 */
+		if (current->wakee_flips > (factor * p->wakee_flips))
+			return 1;
+	}
+
+	return 0;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
 	s64 this_load, load;
@@ -3164,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 
+	/*
+	 * If we wake multiple tasks be careful to not bounce
+	 * ourselves around too much.
+	 */
+	if (wake_wide(p))
+		return 0;
+
 	idx = sd->wake_idx;
 	this_cpu = smp_processor_id();
 	prev_cpu = task_cpu(p);
@@ -4171,47 +4219,48 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->avg.load_avg_contrib;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long now = jiffies;
+	unsigned long load;
 
-	if (rq->h_load_throttle == now)
+	if (cfs_rq->last_h_load_update == now)
 		return;
 
-	rq->h_load_throttle = now;
+	cfs_rq->h_load_next = NULL;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_load_next = se;
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
 
-	rcu_read_lock();
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-	rcu_read_unlock();
+	if (!se) {
+		cfs_rq->h_load = rq->avg.load_avg_contrib;
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = cfs_rq->h_load_next) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->avg.load_avg_contrib,
+				cfs_rq->runnable_load_avg + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+	update_cfs_rq_h_load(cfs_rq);
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
@@ -4220,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
@@ -4232,54 +4277,62 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 /*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *		during load balancing.
- */
-struct sd_lb_stats {
-	struct sched_group *busiest; /* Busiest group in this sd */
-	struct sched_group *this;  /* Local group in this sd */
-	unsigned long total_load;  /* Total load of all groups in sd */
-	unsigned long total_pwr;   /* Total power of all groups in sd */
-	unsigned long avg_load;	   /* Average load across all groups in sd */
-
-	/** Statistics of this group */
-	unsigned long this_load;
-	unsigned long this_load_per_task;
-	unsigned long this_nr_running;
-	unsigned long this_has_capacity;
-	unsigned int  this_idle_cpus;
-
-	/* Statistics of the busiest group */
-	unsigned int  busiest_idle_cpus;
-	unsigned long max_load;
-	unsigned long busiest_load_per_task;
-	unsigned long busiest_nr_running;
-	unsigned long busiest_group_capacity;
-	unsigned long busiest_has_capacity;
-	unsigned int  busiest_group_weight;
-
-	int group_imb; /* Is there imbalance in this sd */
-};
-
-/*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
 struct sg_lb_stats {
 	unsigned long avg_load; /*Avg load across the CPUs of the group */
 	unsigned long group_load; /* Total load over the CPUs of the group */
-	unsigned long sum_nr_running; /* Nr tasks running in the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-	unsigned long group_capacity;
-	unsigned long idle_cpus;
-	unsigned long group_weight;
+	unsigned long load_per_task;
+	unsigned long group_power;
+	unsigned int sum_nr_running; /* Nr tasks running in the group */
+	unsigned int group_capacity;
+	unsigned int idle_cpus;
+	unsigned int group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *		during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest;	/* Busiest group in this sd */
+	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_load;	/* Total load of all groups in sd */
+	unsigned long total_pwr;	/* Total power of all groups in sd */
+	unsigned long avg_load;	/* Average load across all groups in sd */
+
+	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+	struct sg_lb_stats local_stat;	/* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+	/*
+	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
+	 * We must however clear busiest_stat::avg_load because
+	 * update_sd_pick_busiest() reads this before assignment.
+	 */
+	*sds = (struct sd_lb_stats){
+		.busiest = NULL,
+		.local = NULL,
+		.total_load = 0UL,
+		.total_pwr = 0UL,
+		.busiest_stat = {
+			.avg_load = 0UL,
+		},
+	};
+}
+
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ *
+ * Return: The load index.
  */
 static inline int get_sd_load_idx(struct sched_domain *sd,
 				  enum cpu_idle_type idle)
@@ -4457,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculcate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+	unsigned long max_nr_running, min_nr_running;
+	unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+		    unsigned long load, unsigned long nr_running)
+{
+	if (load > sgi->max_cpu_load)
+		sgi->max_cpu_load = load;
+	if (sgi->min_cpu_load > load)
+		sgi->min_cpu_load = load;
+
+	if (nr_running > sgi->max_nr_running)
+		sgi->max_nr_running = nr_running;
+	if (sgi->min_nr_running > nr_running)
+		sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of a task.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 * might not be a suitable number - should we keep a
+	 * normalized nr_running number somewhere that negates
+	 * the hierarchy?
+	 */
+	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
- * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
-			int local_group, int *balance, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running, max_nr_running, min_nr_running;
-	unsigned long load, max_cpu_load, min_cpu_load;
-	unsigned int balance_cpu = -1, first_idle_cpu = 0;
-	unsigned long avg_load_per_task = 0;
+	struct sg_imb_stats sgi;
+	unsigned long nr_running;
+	unsigned long load;
 	int i;
 
-	if (local_group)
-		balance_cpu = group_balance_cpu(group);
-
-	/* Tally up the load of all CPUs in the group */
-	max_cpu_load = 0;
-	min_cpu_load = ~0UL;
-	max_nr_running = 0;
-	min_nr_running = ~0UL;
+	init_sg_imb_stats(&sgi);
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -4492,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu &&
-			    cpumask_test_cpu(i, sched_group_mask(group))) {
-				first_idle_cpu = 1;
-				balance_cpu = i;
-			}
-
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
-				max_cpu_load = load;
-			if (min_cpu_load > load)
-				min_cpu_load = load;
-
-			if (nr_running > max_nr_running)
-				max_nr_running = nr_running;
-			if (min_nr_running > nr_running)
-				min_nr_running = nr_running;
+			update_sg_imb_stats(&sgi, load, nr_running);
 		}
 
 		sgs->group_load += load;
@@ -4519,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->idle_cpus++;
 	}
 
-	/*
-	 * First idle cpu or the first cpu(busiest) in this sched group
-	 * is eligible for doing load balancing at this and above
-	 * domains. In the newly idle case, we will allow all the cpu's
-	 * to do the newly idle load balance.
-	 */
-	if (local_group) {
-		if (env->idle != CPU_NEWLY_IDLE) {
-			if (balance_cpu != env->dst_cpu) {
-				*balance = 0;
-				return;
-			}
-			update_group_power(env->sd, env->dst_cpu);
-		} else if (time_after_eq(jiffies, group->sgp->next_update))
-			update_group_power(env->sd, env->dst_cpu);
-	}
+	if (local_group && (env->idle != CPU_NEWLY_IDLE ||
+			time_after_eq(jiffies, group->sgp->next_update)))
+		update_group_power(env->sd, env->dst_cpu);
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+	sgs->group_power = group->sgp->power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-	/*
-	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of a task.
-	 *
-	 * APZ: with cgroup the avg task weight can vary wildly and
-	 * might not be a suitable number - should we keep a
-	 * normalized nr_running number somewhere that negates
-	 * the hierarchy?
-	 */
 	if (sgs->sum_nr_running)
-		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+
+	sgs->group_imb = sg_imbalanced(sgs, &sgi);
 
-	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
-	    (max_nr_running - min_nr_running) > 1)
-		sgs->group_imb = 1;
+	sgs->group_capacity =
+		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
-						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(env->sd, group);
+
 	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4574,13 +4658,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
  *
  * Determine if @sg is a busier group than the previously selected
  * busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
  */
 static bool update_sd_pick_busiest(struct lb_env *env,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs)
 {
-	if (sgs->avg_load <= sds->max_load)
+	if (sgs->avg_load <= sds->busiest_stat.avg_load)
 		return false;
 
 	if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4613,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
  * @sds: variable to hold the statistics for this sched_domain.
  */
 static inline void update_sd_lb_stats(struct lb_env *env,
-					int *balance, struct sd_lb_stats *sds)
+					struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
-	struct sg_lb_stats sgs;
+	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
@@ -4626,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
+		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
 
 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
-		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
-
-		if (local_group && !(*balance))
-			return;
+		if (local_group) {
+			sds->local = sg;
+			sgs = &sds->local_stat;
+		}
 
-		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->sgp->power;
+		memset(sgs, 0, sizeof(*sgs));
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -4648,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 		 * heaviest group when it is already under-utilized (possible
 		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling && !local_group && sds->this_has_capacity)
-			sgs.group_capacity = min(sgs.group_capacity, 1UL);
+		if (prefer_sibling && !local_group &&
+		    sds->local && sds->local_stat.group_has_capacity)
+			sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-		if (local_group) {
-			sds->this_load = sgs.avg_load;
-			sds->this = sg;
-			sds->this_nr_running = sgs.sum_nr_running;
-			sds->this_load_per_task = sgs.sum_weighted_load;
-			sds->this_has_capacity = sgs.group_has_capacity;
-			sds->this_idle_cpus = sgs.idle_cpus;
-		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
-			sds->max_load = sgs.avg_load;
+		/* Now, start updating sd_lb_stats */
+		sds->total_load += sgs->group_load;
+		sds->total_pwr += sgs->group_power;
+
+		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
-			sds->busiest_nr_running = sgs.sum_nr_running;
-			sds->busiest_idle_cpus = sgs.idle_cpus;
-			sds->busiest_group_capacity = sgs.group_capacity;
-			sds->busiest_load_per_task = sgs.sum_weighted_load;
-			sds->busiest_has_capacity = sgs.group_has_capacity;
-			sds->busiest_group_weight = sgs.group_weight;
-			sds->group_imb = sgs.group_imb;
+			sds->busiest_stat = *sgs;
 		}
 
 		sg = sg->next;
@@ -4691,7 +4769,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
  * assuming lower CPU number will be equivalent to lower a SMT thread
  * number.
  *
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
  * this CPU. The amount of the imbalance is returned in *imbalance.
  *
  * @env: The load balancing environment.
@@ -4712,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 		return 0;
 
 	env->imbalance = DIV_ROUND_CLOSEST(
-		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
+		SCHED_POWER_SCALE);
 
 	return 1;
 }
@@ -4730,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
 	unsigned long scaled_busy_load_per_task;
+	struct sg_lb_stats *local, *busiest;
 
-	if (sds->this_nr_running) {
-		sds->this_load_per_task /= sds->this_nr_running;
-		if (sds->busiest_load_per_task >
-				sds->this_load_per_task)
-			imbn = 1;
-	} else {
-		sds->this_load_per_task =
-			cpu_avg_load_per_task(env->dst_cpu);
-	}
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
+
+	if (!local->sum_nr_running)
+		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+	else if (busiest->load_per_task > local->load_per_task)
+		imbn = 1;
 
-	scaled_busy_load_per_task = sds->busiest_load_per_task
-					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->sgp->power;
+	scaled_busy_load_per_task =
+		(busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
 
-	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+	if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
-		env->imbalance = sds->busiest_load_per_task;
+		env->imbalance = busiest->load_per_task;
 		return;
 	}
 
@@ -4757,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->sgp->power *
-			min(sds->this_load_per_task, sds->this_load);
+	pwr_now += busiest->group_power *
+			min(busiest->load_per_task, busiest->avg_load);
+	pwr_now += local->group_power *
+			min(local->load_per_task, local->avg_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->sgp->power;
-	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load - tmp);
+	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
+	if (busiest->avg_load > tmp) {
+		pwr_move += busiest->group_power *
+			    min(busiest->load_per_task,
+				busiest->avg_load - tmp);
+	}
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->sgp->power <
-		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->sgp->power) /
-			sds->this->sgp->power;
-	else
-		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->sgp->power;
-	pwr_move += sds->this->sgp->power *
-		min(sds->this_load_per_task, sds->this_load + tmp);
+	if (busiest->avg_load * busiest->group_power <
+	    busiest->load_per_task * SCHED_POWER_SCALE) {
+		tmp = (busiest->avg_load * busiest->group_power) /
+			local->group_power;
+	} else {
+		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+			local->group_power;
+	}
+	pwr_move += local->group_power *
+		    min(local->load_per_task, local->avg_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
-		env->imbalance = sds->busiest_load_per_task;
+		env->imbalance = busiest->load_per_task;
 }
 
 /**
@@ -4796,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long max_pull, load_above_capacity = ~0UL;
+	struct sg_lb_stats *local, *busiest;
 
-	sds->busiest_load_per_task /= sds->busiest_nr_running;
-	if (sds->group_imb) {
-		sds->busiest_load_per_task =
-			min(sds->busiest_load_per_task, sds->avg_load);
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
+
+	if (busiest->group_imb) {
+		/*
+		 * In the group_imb case we cannot rely on group-wide averages
+		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 */
+		busiest->load_per_task =
+			min(busiest->load_per_task, sds->avg_load);
 	}
 
 	/*
@@ -4808,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (sds->max_load < sds->avg_load) {
+	if (busiest->avg_load < sds->avg_load) {
 		env->imbalance = 0;
 		return fix_small_imbalance(env, sds);
 	}
 
-	if (!sds->group_imb) {
+	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
+		 * Except of course for the group_imb case, since then we might
+		 * have to drop below capacity to reach cpu-load equilibrium.
 		 */
-		load_above_capacity = (sds->busiest_nr_running -
-						sds->busiest_group_capacity);
+		load_above_capacity =
+			(busiest->sum_nr_running - busiest->group_capacity);
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
-		load_above_capacity /= sds->busiest->sgp->power;
+		load_above_capacity /= busiest->group_power;
 	}
 
 	/*
@@ -4832,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
-	 * Be careful of negative numbers as they'll appear as very large values
-	 * with unsigned longs.
 	 */
-	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	env->imbalance = min(max_pull * sds->busiest->sgp->power,
-		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
-		/ SCHED_POWER_SCALE;
+	env->imbalance = min(
+		max_pull * busiest->group_power,
+		(sds->avg_load - local->avg_load) * local->group_power
+	) / SCHED_POWER_SCALE;
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
@@ -4848,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (env->imbalance < sds->busiest_load_per_task)
+	if (env->imbalance < busiest->load_per_task)
 		return fix_small_imbalance(env, sds);
-
 }
 
 /******* find_busiest_group() helpers end here *********************/
@@ -4866,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * to restore balance.
  *
  * @env: The load balancing environment.
- * @balance: Pointer to a variable indicating if this_cpu
- *	is the appropriate cpu to perform load balancing at this_level.
  *
- * Returns:	- the busiest group if imbalance exists.
+ * Return:	- The busiest group if imbalance exists.
  *		- If no imbalance and user has opted for power-savings balance,
  *		   return the least loaded group whose CPUs can be
  *		   put to idle by rebalancing its tasks onto our group.
  */
-static struct sched_group *
-find_busiest_group(struct lb_env *env, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
 {
+	struct sg_lb_stats *local, *busiest;
 	struct sd_lb_stats sds;
 
-	memset(&sds, 0, sizeof(sds));
+	init_sd_lb_stats(&sds);
 
 	/*
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(env, balance, &sds);
-
-	/*
-	 * this_cpu is not the appropriate cpu to perform load balancing at
-	 * this level.
-	 */
-	if (!(*balance))
-		goto ret;
+	update_sd_lb_stats(env, &sds);
+	local = &sds.local_stat;
+	busiest = &sds.busiest_stat;
 
 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
 	    check_asym_packing(env, &sds))
 		return sds.busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-	if (!sds.busiest || sds.busiest_nr_running == 0)
+	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
-	 * work because they assumes all things are equal, which typically
+	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (sds.group_imb)
+	if (busiest->group_imb)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-	    !sds.busiest_has_capacity)
+	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
+	    !busiest->group_has_capacity)
 		goto force_balance;
 
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (sds.this_load >= sds.max_load)
+	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	if (sds.this_load >= sds.avg_load)
+	if (local->avg_load >= sds.avg_load)
 		goto out_balanced;
 
 	if (env->idle == CPU_IDLE) {
@@ -4938,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance)
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-		    sds.busiest_nr_running <= sds.busiest_group_weight)
+		if ((local->idle_cpus < busiest->idle_cpus) &&
+		    busiest->sum_nr_running <= busiest->group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
+		if (100 * busiest->avg_load <=
+				env->sd->imbalance_pct * local->avg_load)
 			goto out_balanced;
 	}
 
@@ -4956,7 +5037,6 @@ force_balance:
 	return sds.busiest;
 
 out_balanced:
-ret:
 	env->imbalance = 0;
 	return NULL;
 }
@@ -4968,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)
 {
 	struct rq *busiest = NULL, *rq;
-	unsigned long max_load = 0;
+	unsigned long busiest_load = 0, busiest_power = 1;
 	int i;
 
-	for_each_cpu(i, sched_group_cpus(group)) {
+	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		unsigned long power = power_of(i);
 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
							   SCHED_POWER_SCALE);
@@ -4980,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		if (!cpumask_test_cpu(i, env->cpus))
-			continue;
-
 		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
@@ -4998,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * the weighted_cpuload() scaled with the cpu power, so that
 		 * the load can be moved away from the cpu that is potentially
 		 * running at a lower capacity.
+		 *
+		 * Thus we're looking for max(wl_i / power_i), crosswise
+		 * multiplication to rid ourselves of the division works out
+		 * to: wl_i * power_j > wl_j * power_i;  where j is our
+		 * previous maximum.
 		 */
-		wl = (wl * SCHED_POWER_SCALE) / power;
-
-		if (wl > max_load) {
-			max_load = wl;
+		if (wl * busiest_power > busiest_load * power) {
+			busiest_load = wl;
+			busiest_power = power;
 			busiest = rq;
 		}
 	}
@@ -5039,13 +5120,47 @@ static int need_active_balance(struct lb_env *env)
 
 static int active_load_balance_cpu_stop(void *data);
 
+static int should_we_balance(struct lb_env *env)
+{
+	struct sched_group *sg = env->sd->groups;
+	struct cpumask *sg_cpus, *sg_mask;
+	int cpu, balance_cpu = -1;
+
+	/*
+	 * In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (env->idle == CPU_NEWLY_IDLE)
+		return 1;
+
+	sg_cpus = sched_group_cpus(sg);
+	sg_mask = sched_group_mask(sg);
+	/* Try to find first idle cpu */
+	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+			continue;
+
+		balance_cpu = cpu;
+		break;
+	}
+
+	if (balance_cpu == -1)
+		balance_cpu = group_balance_cpu(sg);
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above domains.
+	 */
+	return balance_cpu != env->dst_cpu;
+}
+
 /*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
 static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance)
+			int *continue_balancing)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
 	struct sched_group *group;
@@ -5075,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(&env, balance);
-
-	if (*balance == 0)
+	if (!should_we_balance(&env)) {
+		*continue_balancing = 0;
 		goto out_balanced;
+	}
 
+	group = find_busiest_group(&env);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -5108,7 +5224,6 @@ redo:
 	env.src_rq = busiest;
 	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-	update_h_load(env.src_cpu);
 more_balance:
 	local_irq_save(flags);
 	double_rq_lock(env.dst_rq, busiest);
@@ -5292,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
-		int balance = 1;
+		int continue_balancing = 1;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -5300,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE, &balance);
+						   sd, CPU_NEWLY_IDLE,
+						   &continue_balancing);
 		}
 
 		interval = msecs_to_jiffies(sd->balance_interval);
@@ -5506,7 +5622,7 @@ void nohz_balance_enter_idle(int cpu)
 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 
-static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+static int sched_ilb_notifier(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
@@ -5538,7 +5654,7 @@ void update_max_interval(void)
 */
 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
-	int balance = 1;
+	int continue_balancing = 1;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
@@ -5570,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance)) {
+			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
 				 * The LBF_SOME_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
@@ -5593,7 +5709,7 @@ out:
 		 * CPU in our sched group which is doing load balancing more
 		 * actively.
 		 */
-		if (!balance)
+		if (!continue_balancing)
			break;
 	}
 	rcu_read_unlock();
@@ -5786,7 +5902,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (sched_feat_numa(NUMA))
+	if (numabalancing_enabled)
 		task_tick_numa(rq, curr);
 
 	update_rq_runnable_avg(rq, 1);
@@ -5889,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * and ensure we don't carry in an old decay_count if we
 	 * switch back.
 	 */
-	if (p->se.avg.decay_count) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
-		__synchronize_entity_decay(&p->se);
-		subtract_blocked_load_contrib(cfs_rq,
-				p->se.avg.load_avg_contrib);
+	if (se->avg.decay_count) {
+		__synchronize_entity_decay(se);
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 	}
 #endif
 }