author    Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 11:36:35 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 11:36:35 -0400
commit    5e0b3a4e88012d259e8b2c0f02f393c79686daf9
tree      1c6d7be145a7cce77996049eb78877ed95e87a4f
parent    0d99b7087324978b09b59d8c7a0736214c4a42b1
parent    10866e62e8a6907d9072f10f9a0561db0c0cf50b
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "Various optimizations, cleanups and smaller fixes - no major changes
  in scheduler behavior"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix the sd_parent_degenerate() code
  sched/fair: Rework and comment the group_imb code
  sched/fair: Optimize find_busiest_queue()
  sched/fair: Make group power more consistent
  sched/fair: Remove duplicate load_per_task computations
  sched/fair: Shrink sg_lb_stats and play memset games
  sched: Clean-up struct sd_lb_stat
  sched: Factor out code to should_we_balance()
  sched: Remove one division operation in find_busiest_queue()
  sched/cputime: Use this_cpu_add() in task_group_account_field()
  cpumask: Fix cpumask leak in partition_sched_domains()
  sched/x86: Optimize switch_mm() for multi-threaded workloads
  generic-ipi: Kill unnecessary variable - csd_flags
  numa: Mark __node_set() as __always_inline
  sched/fair: Cleanup: remove duplicate variable declaration
  sched/__wake_up_sync_key(): Fix nr_exclusive tasks which lead to WF_SYNC clearing
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c    |  17
-rw-r--r--  kernel/sched/cputime.c |   2
-rw-r--r--  kernel/sched/fair.c    | 511
-rw-r--r--  kernel/smp.c           |  14
4 files changed, 303 insertions(+), 241 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 57c186d9477e..b8e2162fc803 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2677,7 +2677,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 	if (unlikely(!q))
 		return;
 
-	if (unlikely(!nr_exclusive))
+	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
@@ -4964,7 +4964,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_FORK |
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
-				SD_SHARE_PKG_RESOURCES);
+				SD_SHARE_PKG_RESOURCES |
+				SD_PREFER_SIBLING);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -5173,6 +5174,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 			tmp->parent = parent->parent;
 			if (parent->parent)
 				parent->parent->child = tmp;
+			/*
+			 * Transfer SD_PREFER_SIBLING down in case of a
+			 * degenerate parent; the spans match for this
+			 * so the property transfers.
+			 */
+			if (parent->flags & SD_PREFER_SIBLING)
+				tmp->flags |= SD_PREFER_SIBLING;
 			destroy_sched_domain(parent, cpu);
 		} else
 			tmp = tmp->parent;
@@ -6239,8 +6247,9 @@ match1:
 		;
 	}
 
+	n = ndoms_cur;
 	if (doms_new == NULL) {
-		ndoms_cur = 0;
+		n = 0;
 		doms_new = &fallback_doms;
 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
@@ -6248,7 +6257,7 @@ match1:
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur && !new_topology; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..e89ccefef278 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 * is the only cgroup, then nothing else should be necessary.
 	 *
 	 */
-	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
 	cpuacct_account_field(p, index, tmp);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8977a249816f..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4277,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 /*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *		during load balancing.
- */
-struct sd_lb_stats {
-	struct sched_group *busiest; /* Busiest group in this sd */
-	struct sched_group *this;  /* Local group in this sd */
-	unsigned long total_load;  /* Total load of all groups in sd */
-	unsigned long total_pwr;   /* Total power of all groups in sd */
-	unsigned long avg_load;	   /* Average load across all groups in sd */
-
-	/** Statistics of this group */
-	unsigned long this_load;
-	unsigned long this_load_per_task;
-	unsigned long this_nr_running;
-	unsigned long this_has_capacity;
-	unsigned int  this_idle_cpus;
-
-	/* Statistics of the busiest group */
-	unsigned int  busiest_idle_cpus;
-	unsigned long max_load;
-	unsigned long busiest_load_per_task;
-	unsigned long busiest_nr_running;
-	unsigned long busiest_group_capacity;
-	unsigned long busiest_has_capacity;
-	unsigned int  busiest_group_weight;
-
-	int group_imb; /* Is there imbalance in this sd */
-};
-
-/*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
 struct sg_lb_stats {
 	unsigned long avg_load; /*Avg load across the CPUs of the group */
 	unsigned long group_load; /* Total load over the CPUs of the group */
-	unsigned long sum_nr_running; /* Nr tasks running in the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-	unsigned long group_capacity;
-	unsigned long idle_cpus;
-	unsigned long group_weight;
+	unsigned long load_per_task;
+	unsigned long group_power;
+	unsigned int sum_nr_running; /* Nr tasks running in the group */
+	unsigned int group_capacity;
+	unsigned int idle_cpus;
+	unsigned int group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *		 during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest;	/* Busiest group in this sd */
+	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_load;	/* Total load of all groups in sd */
+	unsigned long total_pwr;	/* Total power of all groups in sd */
+	unsigned long avg_load;		/* Average load across all groups in sd */
+
+	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+	struct sg_lb_stats local_stat;	/* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+	/*
+	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
+	 * We must however clear busiest_stat::avg_load because
+	 * update_sd_pick_busiest() reads this before assignment.
+	 */
+	*sds = (struct sd_lb_stats){
+		.busiest = NULL,
+		.local = NULL,
+		.total_load = 0UL,
+		.total_pwr = 0UL,
+		.busiest_stat = {
+			.avg_load = 0UL,
+		},
+	};
+}
+
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
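The init_sd_lb_stats() helper above leans on a plain C guarantee: in a compound literal with designated initializers, every member that is not named is zero-initialized, so nothing in busiest_stat beyond avg_load (nor the whole local_stat) has to be listed for the structure to start out in a known state. A minimal standalone sketch of that behaviour, with made-up struct names rather than the kernel's:

#include <stdio.h>

/* Made-up miniature of the sd_lb_stats/sg_lb_stats pair. */
struct group_stat {
	unsigned long avg_load;
	unsigned int nr_running;
};

struct domain_stat {
	void *busiest;
	unsigned long total_load;
	struct group_stat busiest_stat;
	struct group_stat local_stat;
};

int main(void)
{
	struct domain_stat ds;

	/* Only busiest_stat.avg_load is named; every member left unnamed,
	 * including the whole local_stat, is implicitly zero-initialized -
	 * the same designated-initializer guarantee init_sd_lb_stats()
	 * relies on. */
	ds = (struct domain_stat){
		.busiest_stat = {
			.avg_load = 0UL,
		},
	};

	printf("busiest=%p total_load=%lu local.nr_running=%u\n",
	       ds.busiest, ds.total_load, ds.local_stat.nr_running);
	return 0;
}

Compiled as ordinary C, this prints all-zero fields even though only one of them appears in the initializer.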
@@ -4504,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculcate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+	unsigned long max_nr_running, min_nr_running;
+	unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+		    unsigned long load, unsigned long nr_running)
+{
+	if (load > sgi->max_cpu_load)
+		sgi->max_cpu_load = load;
+	if (sgi->min_cpu_load > load)
+		sgi->min_cpu_load = load;
+
+	if (nr_running > sgi->max_nr_running)
+		sgi->max_nr_running = nr_running;
+	if (sgi->min_nr_running > nr_running)
+		sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of a task.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
- * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, int *balance, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running, max_nr_running, min_nr_running;
-	unsigned long load, max_cpu_load, min_cpu_load;
-	unsigned int balance_cpu = -1, first_idle_cpu = 0;
-	unsigned long avg_load_per_task = 0;
+	struct sg_imb_stats sgi;
+	unsigned long nr_running;
+	unsigned long load;
 	int i;
 
-	if (local_group)
-		balance_cpu = group_balance_cpu(group);
-
-	/* Tally up the load of all CPUs in the group */
-	max_cpu_load = 0;
-	min_cpu_load = ~0UL;
-	max_nr_running = 0;
-	min_nr_running = ~0UL;
+	init_sg_imb_stats(&sgi);
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
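As a rough standalone illustration of the sg_imbalanced() test added above, using invented load figures for the first group of the { 0 1 2 3 } { 4 5 6 7 } example (two pinned tasks stacked on cpu 3, cpus 0-2 idle); this mirrors the condition only, not the kernel's data structures:

#include <stdio.h>

/* Simplified mirror of sg_imbalanced(): the spread in per-cpu load must
 * reach the average task weight and the spread in nr_running must be
 * larger than one. */
static int group_looks_imbalanced(unsigned long max_cpu_load,
				  unsigned long min_cpu_load,
				  unsigned long max_nr_running,
				  unsigned long min_nr_running,
				  unsigned long load_per_task)
{
	return (max_cpu_load - min_cpu_load) >= load_per_task &&
	       (max_nr_running - min_nr_running) > 1;
}

int main(void)
{
	/* Invented figures: after a group-wise balance, cpu 3 of the first
	 * group runs both of its pinned tasks while cpus 0-2 sit idle. */
	unsigned long max_load = 2048, min_load = 0;	/* per-cpu load spread */
	unsigned long max_nr = 2, min_nr = 0;		/* nr_running spread */
	unsigned long load_per_task = 1024;

	printf("group_imb = %d\n",
	       group_looks_imbalanced(max_load, min_load, max_nr, min_nr,
				      load_per_task));	/* prints 1 */
	return 0;
}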
@@ -4539,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu &&
-					cpumask_test_cpu(i, sched_group_mask(group))) {
-				first_idle_cpu = 1;
-				balance_cpu = i;
-			}
-
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
-				max_cpu_load = load;
-			if (min_cpu_load > load)
-				min_cpu_load = load;
-
-			if (nr_running > max_nr_running)
-				max_nr_running = nr_running;
-			if (min_nr_running > nr_running)
-				min_nr_running = nr_running;
+			update_sg_imb_stats(&sgi, load, nr_running);
 		}
 
 		sgs->group_load += load;
@@ -4566,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			sgs->idle_cpus++;
 	}
 
-	/*
-	 * First idle cpu or the first cpu(busiest) in this sched group
-	 * is eligible for doing load balancing at this and above
-	 * domains. In the newly idle case, we will allow all the cpu's
-	 * to do the newly idle load balance.
-	 */
-	if (local_group) {
-		if (env->idle != CPU_NEWLY_IDLE) {
-			if (balance_cpu != env->dst_cpu) {
-				*balance = 0;
-				return;
-			}
-			update_group_power(env->sd, env->dst_cpu);
-		} else if (time_after_eq(jiffies, group->sgp->next_update))
-			update_group_power(env->sd, env->dst_cpu);
-	}
+	if (local_group && (env->idle != CPU_NEWLY_IDLE ||
+			time_after_eq(jiffies, group->sgp->next_update)))
+		update_group_power(env->sd, env->dst_cpu);
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+	sgs->group_power = group->sgp->power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-	/*
-	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of a task.
-	 *
-	 * APZ: with cgroup the avg task weight can vary wildly and
-	 *      might not be a suitable number - should we keep a
-	 *      normalized nr_running number somewhere that negates
-	 *      the hierarchy?
-	 */
 	if (sgs->sum_nr_running)
-		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
-	    (max_nr_running - min_nr_running) > 1)
-		sgs->group_imb = 1;
+	sgs->group_imb = sg_imbalanced(sgs, &sgi);
+
+	sgs->group_capacity =
+		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
-						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(env->sd, group);
+
 	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4630,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 				   struct sched_group *sg,
 				   struct sg_lb_stats *sgs)
 {
-	if (sgs->avg_load <= sds->max_load)
+	if (sgs->avg_load <= sds->busiest_stat.avg_load)
 		return false;
 
 	if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4663,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
  * @sds: variable to hold the statistics for this sched_domain.
  */
 static inline void update_sd_lb_stats(struct lb_env *env,
-					int *balance, struct sd_lb_stats *sds)
+					struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
-	struct sg_lb_stats sgs;
+	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
@@ -4676,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
+		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
 
 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
-		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
-
-		if (local_group && !(*balance))
-			return;
+		if (local_group) {
+			sds->local = sg;
+			sgs = &sds->local_stat;
+		}
 
-		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->sgp->power;
+		memset(sgs, 0, sizeof(*sgs));
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -4698,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 		 * heaviest group when it is already under-utilized (possible
 		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling && !local_group && sds->this_has_capacity)
-			sgs.group_capacity = min(sgs.group_capacity, 1UL);
+		if (prefer_sibling && !local_group &&
+		    sds->local && sds->local_stat.group_has_capacity)
+			sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-		if (local_group) {
-			sds->this_load = sgs.avg_load;
-			sds->this = sg;
-			sds->this_nr_running = sgs.sum_nr_running;
-			sds->this_load_per_task = sgs.sum_weighted_load;
-			sds->this_has_capacity = sgs.group_has_capacity;
-			sds->this_idle_cpus = sgs.idle_cpus;
-		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
-			sds->max_load = sgs.avg_load;
+		/* Now, start updating sd_lb_stats */
+		sds->total_load += sgs->group_load;
+		sds->total_pwr += sgs->group_power;
+
+		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
-			sds->busiest_nr_running = sgs.sum_nr_running;
-			sds->busiest_idle_cpus = sgs.idle_cpus;
-			sds->busiest_group_capacity = sgs.group_capacity;
-			sds->busiest_load_per_task = sgs.sum_weighted_load;
-			sds->busiest_has_capacity = sgs.group_has_capacity;
-			sds->busiest_group_weight = sgs.group_weight;
-			sds->group_imb = sgs.group_imb;
+			sds->busiest_stat = *sgs;
 		}
 
 		sg = sg->next;
@@ -4762,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 		return 0;
 
 	env->imbalance = DIV_ROUND_CLOSEST(
-		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
+		SCHED_POWER_SCALE);
 
 	return 1;
 }
@@ -4780,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
 	unsigned long scaled_busy_load_per_task;
+	struct sg_lb_stats *local, *busiest;
 
-	if (sds->this_nr_running) {
-		sds->this_load_per_task /= sds->this_nr_running;
-		if (sds->busiest_load_per_task >
-				sds->this_load_per_task)
-			imbn = 1;
-	} else {
-		sds->this_load_per_task =
-			cpu_avg_load_per_task(env->dst_cpu);
-	}
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
 
-	scaled_busy_load_per_task = sds->busiest_load_per_task
-					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->sgp->power;
+	if (!local->sum_nr_running)
+		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+	else if (busiest->load_per_task > local->load_per_task)
+		imbn = 1;
 
-	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
-			(scaled_busy_load_per_task * imbn)) {
-		env->imbalance = sds->busiest_load_per_task;
+	scaled_busy_load_per_task =
+		(busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
+
+	if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
+	    (scaled_busy_load_per_task * imbn)) {
+		env->imbalance = busiest->load_per_task;
 		return;
 	}
 
@@ -4807,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->sgp->power *
-			min(sds->this_load_per_task, sds->this_load);
+	pwr_now += busiest->group_power *
+			min(busiest->load_per_task, busiest->avg_load);
+	pwr_now += local->group_power *
+			min(local->load_per_task, local->avg_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->sgp->power;
-	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load - tmp);
+	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
+	if (busiest->avg_load > tmp) {
+		pwr_move += busiest->group_power *
+			    min(busiest->load_per_task,
+				busiest->avg_load - tmp);
+	}
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->sgp->power <
-		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->sgp->power) /
-			sds->this->sgp->power;
-	else
-		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->sgp->power;
-	pwr_move += sds->this->sgp->power *
-			min(sds->this_load_per_task, sds->this_load + tmp);
+	if (busiest->avg_load * busiest->group_power <
+	    busiest->load_per_task * SCHED_POWER_SCALE) {
+		tmp = (busiest->avg_load * busiest->group_power) /
+		      local->group_power;
+	} else {
+		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+		      local->group_power;
+	}
+	pwr_move += local->group_power *
+		    min(local->load_per_task, local->avg_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
-		env->imbalance = sds->busiest_load_per_task;
+		env->imbalance = busiest->load_per_task;
 }
 
 /**
@@ -4846,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long max_pull, load_above_capacity = ~0UL;
+	struct sg_lb_stats *local, *busiest;
+
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
 
-	sds->busiest_load_per_task /= sds->busiest_nr_running;
-	if (sds->group_imb) {
-		sds->busiest_load_per_task =
-			min(sds->busiest_load_per_task, sds->avg_load);
+	if (busiest->group_imb) {
+		/*
+		 * In the group_imb case we cannot rely on group-wide averages
+		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 */
+		busiest->load_per_task =
+			min(busiest->load_per_task, sds->avg_load);
 	}
 
 	/*
@@ -4858,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (sds->max_load < sds->avg_load) {
+	if (busiest->avg_load < sds->avg_load) {
 		env->imbalance = 0;
 		return fix_small_imbalance(env, sds);
 	}
 
-	if (!sds->group_imb) {
+	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
+		 * Except of course for the group_imb case, since then we might
+		 * have to drop below capacity to reach cpu-load equilibrium.
 		 */
-		load_above_capacity = (sds->busiest_nr_running -
-						sds->busiest_group_capacity);
+		load_above_capacity =
+			(busiest->sum_nr_running - busiest->group_capacity);
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
-		load_above_capacity /= sds->busiest->sgp->power;
+		load_above_capacity /= busiest->group_power;
 	}
 
 	/*
@@ -4882,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
-	 * Be careful of negative numbers as they'll appear as very large values
-	 * with unsigned longs.
 	 */
-	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	env->imbalance = min(max_pull * sds->busiest->sgp->power,
-		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
-			/ SCHED_POWER_SCALE;
+	env->imbalance = min(
+		max_pull * busiest->group_power,
+		(sds->avg_load - local->avg_load) * local->group_power
+	) / SCHED_POWER_SCALE;
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
@@ -4898,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (env->imbalance < sds->busiest_load_per_task)
+	if (env->imbalance < busiest->load_per_task)
 		return fix_small_imbalance(env, sds);
-
 }
 
 /******* find_busiest_group() helpers end here *********************/
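To see what the reworked calculate_imbalance() arithmetic above amounts to, here is a worked example with invented numbers (a power scale of 1024 and per-group loads already expressed in that scale), written as a standalone sketch rather than kernel code:

#include <stdio.h>

#define SCALE 1024UL	/* stand-in for SCHED_POWER_SCALE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Invented per-group figures. */
	unsigned long busiest_avg_load = 2048, busiest_power = 1024;
	unsigned long local_avg_load   =  900, local_power   = 1024;
	unsigned long domain_avg_load  = 1280;
	unsigned long load_above_capacity = ~0UL;	/* no capacity cap here */

	/* Pull no more than the busiest group's excess over the domain average. */
	unsigned long max_pull = min_ul(busiest_avg_load - domain_avg_load,
					load_above_capacity);

	/* Move the smaller of what busiest can give and what local can absorb,
	 * converted back out of the power scale. */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (domain_avg_load - local_avg_load) * local_power)
				  / SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);	/* 768, 380 */
	return 0;
}

With these figures the local group's headroom (380) is the limiting factor, so that becomes the imbalance rather than the busiest group's full excess (768).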
@@ -4916,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * to restore balance.
  *
  * @env: The load balancing environment.
- * @balance: Pointer to a variable indicating if this_cpu
- *	is the appropriate cpu to perform load balancing at this_level.
  *
  * Return:	- The busiest group if imbalance exists.
  *		- If no imbalance and user has opted for power-savings balance,
  *		   return the least loaded group whose CPUs can be
  *		   put to idle by rebalancing its tasks onto our group.
  */
-static struct sched_group *
-find_busiest_group(struct lb_env *env, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
 {
+	struct sg_lb_stats *local, *busiest;
 	struct sd_lb_stats sds;
 
-	memset(&sds, 0, sizeof(sds));
+	init_sd_lb_stats(&sds);
 
 	/*
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(env, balance, &sds);
-
-	/*
-	 * this_cpu is not the appropriate cpu to perform load balancing at
-	 * this level.
-	 */
-	if (!(*balance))
-		goto ret;
+	update_sd_lb_stats(env, &sds);
+	local = &sds.local_stat;
+	busiest = &sds.busiest_stat;
 
 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
 	    check_asym_packing(env, &sds))
 		return sds.busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-	if (!sds.busiest || sds.busiest_nr_running == 0)
+	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
-	 * work because they assumes all things are equal, which typically
+	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (sds.group_imb)
+	if (busiest->group_imb)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-			!sds.busiest_has_capacity)
+	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
+	    !busiest->group_has_capacity)
 		goto force_balance;
 
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (sds.this_load >= sds.max_load)
+	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	if (sds.this_load >= sds.avg_load)
+	if (local->avg_load >= sds.avg_load)
 		goto out_balanced;
 
 	if (env->idle == CPU_IDLE) {
@@ -4988,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance)
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-		    sds.busiest_nr_running <= sds.busiest_group_weight)
+		if ((local->idle_cpus < busiest->idle_cpus) &&
+		    busiest->sum_nr_running <= busiest->group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
+		if (100 * busiest->avg_load <=
+				env->sd->imbalance_pct * local->avg_load)
 			goto out_balanced;
 	}
 
@@ -5006,7 +5037,6 @@ force_balance:
 	return sds.busiest;
 
 out_balanced:
-ret:
 	env->imbalance = 0;
 	return NULL;
 }
@@ -5018,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 				     struct sched_group *group)
 {
 	struct rq *busiest = NULL, *rq;
-	unsigned long max_load = 0;
+	unsigned long busiest_load = 0, busiest_power = 1;
 	int i;
 
-	for_each_cpu(i, sched_group_cpus(group)) {
+	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		unsigned long power = power_of(i);
 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
 							   SCHED_POWER_SCALE);
@@ -5030,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		if (!cpumask_test_cpu(i, env->cpus))
-			continue;
-
 		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
@@ -5048,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * the weighted_cpuload() scaled with the cpu power, so that
 		 * the load can be moved away from the cpu that is potentially
 		 * running at a lower capacity.
+		 *
+		 * Thus we're looking for max(wl_i / power_i), crosswise
+		 * multiplication to rid ourselves of the division works out
+		 * to: wl_i * power_j > wl_j * power_i;  where j is our
+		 * previous maximum.
 		 */
-		wl = (wl * SCHED_POWER_SCALE) / power;
-
-		if (wl > max_load) {
-			max_load = wl;
+		if (wl * busiest_power > busiest_load * power) {
+			busiest_load = wl;
+			busiest_power = power;
 			busiest = rq;
 		}
 	}
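The comment block above justifies dropping the per-iteration division: for positive powers, wl_i/power_i > wl_j/power_j exactly when wl_i * power_j > wl_j * power_i, where j is the current maximum. A small standalone sketch of that selection loop with invented load and power values (not the kernel's find_busiest_queue() itself):

#include <stdio.h>

/* Pick the index with the largest load/power ratio using only
 * cross-multiplication, mirroring the find_busiest_queue() change. */
static int pick_busiest(const unsigned long *load,
			const unsigned long *power, int n)
{
	unsigned long busiest_load = 0, busiest_power = 1;
	int i, busiest = -1;

	for (i = 0; i < n; i++) {
		/* load[i]/power[i] > busiest_load/busiest_power, division-free */
		if (load[i] * busiest_power > busiest_load * power[i]) {
			busiest_load = load[i];
			busiest_power = power[i];
			busiest = i;
		}
	}
	return busiest;
}

int main(void)
{
	unsigned long load[]  = { 900, 2048, 1024 };	/* invented per-cpu load */
	unsigned long power[] = { 512, 1024, 1024 };	/* invented cpu power */

	/* Ratios: cpu0 ~1.76, cpu1 2.0, cpu2 1.0 -> prints 1 */
	printf("busiest cpu: %d\n", pick_busiest(load, power, 3));
	return 0;
}

Here cpu 1 is picked (the highest ratio) and no division is ever performed inside the loop.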
@@ -5089,13 +5120,47 @@ static int need_active_balance(struct lb_env *env)
 
 static int active_load_balance_cpu_stop(void *data);
 
+static int should_we_balance(struct lb_env *env)
+{
+	struct sched_group *sg = env->sd->groups;
+	struct cpumask *sg_cpus, *sg_mask;
+	int cpu, balance_cpu = -1;
+
+	/*
+	 * In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (env->idle == CPU_NEWLY_IDLE)
+		return 1;
+
+	sg_cpus = sched_group_cpus(sg);
+	sg_mask = sched_group_mask(sg);
+	/* Try to find first idle cpu */
+	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+			continue;
+
+		balance_cpu = cpu;
+		break;
+	}
+
+	if (balance_cpu == -1)
+		balance_cpu = group_balance_cpu(sg);
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above domains.
+	 */
+	return balance_cpu != env->dst_cpu;
+}
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance)
+			int *continue_balancing)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
 	struct sched_group *group;
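The intent of should_we_balance() above is that one designated cpu per group - the first idle cpu in the group, or the group's default balance cpu when none is idle - carries out load balancing at this level while the others back off. A minimal standalone sketch of that selection, with plain arrays standing in for cpumasks (not the kernel API):

#include <stdio.h>

/* Pick the cpu that should balance on behalf of this group: the first
 * idle cpu, or a designated fallback when every cpu is busy. */
static int group_balance_cpu_sketch(const int *cpu_idle, int nr_cpus,
				    int fallback_cpu)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (cpu_idle[cpu])
			return cpu;
	}
	return fallback_cpu;
}

int main(void)
{
	int cpu_idle[4] = { 0, 0, 1, 1 };	/* cpus 2 and 3 are idle */
	int this_cpu = 2;
	int balancer = group_balance_cpu_sketch(cpu_idle, 4, 0);

	/* Only the chosen cpu proceeds with balancing at this level; the
	 * others give up, like load_balance() clearing *continue_balancing
	 * when should_we_balance() says no. */
	printf("designated balancer: cpu %d; cpu %d %s the balancer\n",
	       balancer, this_cpu, this_cpu == balancer ? "is" : "is not");
	return 0;
}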
@@ -5125,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(&env, balance);
-
-	if (*balance == 0)
+	if (!should_we_balance(&env)) {
+		*continue_balancing = 0;
 		goto out_balanced;
+	}
 
+	group = find_busiest_group(&env);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -5341,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
-		int balance = 1;
+		int continue_balancing = 1;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -5349,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE, &balance);
+						   sd, CPU_NEWLY_IDLE,
+						   &continue_balancing);
 		}
 
 		interval = msecs_to_jiffies(sd->balance_interval);
@@ -5587,7 +5654,7 @@ void update_max_interval(void)
 */
 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
-	int balance = 1;
+	int continue_balancing = 1;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
@@ -5619,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance)) {
+			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
 				 * The LBF_SOME_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
@@ -5642,7 +5709,7 @@ out:
 		 * CPU in our sched group which is doing load balancing more
 		 * actively.
 		 */
-		if (!balance)
+		if (!continue_balancing)
 			break;
 	}
 	rcu_read_unlock();
@@ -5938,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * and ensure we don't carry in an old decay_count if we
 	 * switch back.
 	 */
-	if (p->se.avg.decay_count) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
-		__synchronize_entity_decay(&p->se);
-		subtract_blocked_load_contrib(cfs_rq,
-				p->se.avg.load_avg_contrib);
+	if (se->avg.decay_count) {
+		__synchronize_entity_decay(se);
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 	}
 #endif
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index b1c9034bdfcb..449b707fc20d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void)
 
 	while (!list_empty(&list)) {
 		struct call_single_data *csd;
-		unsigned int csd_flags;
 
 		csd = list_entry(list.next, struct call_single_data, list);
 		list_del(&csd->list);
 
-		/*
-		 * 'csd' can be invalid after this call if flags == 0
-		 * (when called through generic_exec_single()),
-		 * so save them away before making the call:
-		 */
-		csd_flags = csd->flags;
-
 		csd->func(csd->info);
 
-		/*
-		 * Unlocked CSDs are valid through generic_exec_single():
-		 */
-		if (csd_flags & CSD_FLAG_LOCK)
-			csd_unlock(csd);
+		csd_unlock(csd);
 	}
 }
 