author     Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 11:36:35 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 11:36:35 -0400
commit     5e0b3a4e88012d259e8b2c0f02f393c79686daf9
tree       1c6d7be145a7cce77996049eb78877ed95e87a4f /kernel
parent     0d99b7087324978b09b59d8c7a0736214c4a42b1
parent     10866e62e8a6907d9072f10f9a0561db0c0cf50b
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"Various optimizations, cleanups and smaller fixes - no major changes
in scheduler behavior"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Fix the sd_parent_degenerate() code
sched/fair: Rework and comment the group_imb code
sched/fair: Optimize find_busiest_queue()
sched/fair: Make group power more consistent
sched/fair: Remove duplicate load_per_task computations
sched/fair: Shrink sg_lb_stats and play memset games
sched: Clean-up struct sd_lb_stat
sched: Factor out code to should_we_balance()
sched: Remove one division operation in find_busiest_queue()
sched/cputime: Use this_cpu_add() in task_group_account_field()
cpumask: Fix cpumask leak in partition_sched_domains()
sched/x86: Optimize switch_mm() for multi-threaded workloads
generic-ipi: Kill unnecessary variable - csd_flags
numa: Mark __node_set() as __always_inline
sched/fair: Cleanup: remove duplicate variable declaration
sched/__wake_up_sync_key(): Fix nr_exclusive tasks which lead to WF_SYNC clearing
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c     |  17
-rw-r--r--  kernel/sched/cputime.c  |   2
-rw-r--r--  kernel/sched/fair.c     | 511
-rw-r--r--  kernel/smp.c            |  14
4 files changed, 303 insertions, 241 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 57c186d9477e..b8e2162fc803 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2677,7 +2677,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 	if (unlikely(!q))
 		return;
 
-	if (unlikely(!nr_exclusive))
+	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
@@ -4964,7 +4964,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_FORK |
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
-				SD_SHARE_PKG_RESOURCES);
+				SD_SHARE_PKG_RESOURCES |
+				SD_PREFER_SIBLING);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -5173,6 +5174,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 			tmp->parent = parent->parent;
 			if (parent->parent)
 				parent->parent->child = tmp;
+			/*
+			 * Transfer SD_PREFER_SIBLING down in case of a
+			 * degenerate parent; the spans match for this
+			 * so the property transfers.
+			 */
+			if (parent->flags & SD_PREFER_SIBLING)
+				tmp->flags |= SD_PREFER_SIBLING;
 			destroy_sched_domain(parent, cpu);
 		} else
 			tmp = tmp->parent;
@@ -6239,8 +6247,9 @@ match1:
 		;
 	}
 
+	n = ndoms_cur;
 	if (doms_new == NULL) {
-		ndoms_cur = 0;
+		n = 0;
 		doms_new = &fallback_doms;
 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
@@ -6248,7 +6257,7 @@ match1:
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur && !new_topology; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..e89ccefef278 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 * is the only cgroup, then nothing else should be necessary.
 	 *
 	 */
-	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
 	cpuacct_account_field(p, index, tmp);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8977a249816f..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4277,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 /*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *		during load balancing.
- */
-struct sd_lb_stats {
-	struct sched_group *busiest;	/* Busiest group in this sd */
-	struct sched_group *this;	/* Local group in this sd */
-	unsigned long total_load;	/* Total load of all groups in sd */
-	unsigned long total_pwr;	/* Total power of all groups in sd */
-	unsigned long avg_load;		/* Average load across all groups in sd */
-
-	/** Statistics of this group */
-	unsigned long this_load;
-	unsigned long this_load_per_task;
-	unsigned long this_nr_running;
-	unsigned long this_has_capacity;
-	unsigned int  this_idle_cpus;
-
-	/* Statistics of the busiest group */
-	unsigned int  busiest_idle_cpus;
-	unsigned long max_load;
-	unsigned long busiest_load_per_task;
-	unsigned long busiest_nr_running;
-	unsigned long busiest_group_capacity;
-	unsigned long busiest_has_capacity;
-	unsigned int  busiest_group_weight;
-
-	int group_imb; /* Is there imbalance in this sd */
-};
-
-/*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
 struct sg_lb_stats {
 	unsigned long avg_load; /*Avg load across the CPUs of the group */
 	unsigned long group_load; /* Total load over the CPUs of the group */
-	unsigned long sum_nr_running; /* Nr tasks running in the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-	unsigned long group_capacity;
-	unsigned long idle_cpus;
-	unsigned long group_weight;
+	unsigned long load_per_task;
+	unsigned long group_power;
+	unsigned int sum_nr_running; /* Nr tasks running in the group */
+	unsigned int group_capacity;
+	unsigned int idle_cpus;
+	unsigned int group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *		 during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest;	/* Busiest group in this sd */
+	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_load;	/* Total load of all groups in sd */
+	unsigned long total_pwr;	/* Total power of all groups in sd */
+	unsigned long avg_load;	/* Average load across all groups in sd */
+
+	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+	struct sg_lb_stats local_stat;	/* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+	/*
+	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
+	 * We must however clear busiest_stat::avg_load because
+	 * update_sd_pick_busiest() reads this before assignment.
+	 */
+	*sds = (struct sd_lb_stats){
+		.busiest = NULL,
+		.local = NULL,
+		.total_load = 0UL,
+		.total_pwr = 0UL,
+		.busiest_stat = {
+			.avg_load = 0UL,
+		},
+	};
+}
+
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
@@ -4504,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculcate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+	unsigned long max_nr_running, min_nr_running;
+	unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+		    unsigned long load, unsigned long nr_running)
+{
+	if (load > sgi->max_cpu_load)
+		sgi->max_cpu_load = load;
+	if (sgi->min_cpu_load > load)
+		sgi->min_cpu_load = load;
+
+	if (nr_running > sgi->max_nr_running)
+		sgi->max_nr_running = nr_running;
+	if (sgi->min_nr_running > nr_running)
+		sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of a task.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
- * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, int *balance, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running, max_nr_running, min_nr_running;
-	unsigned long load, max_cpu_load, min_cpu_load;
-	unsigned int balance_cpu = -1, first_idle_cpu = 0;
-	unsigned long avg_load_per_task = 0;
+	struct sg_imb_stats sgi;
+	unsigned long nr_running;
+	unsigned long load;
 	int i;
 
-	if (local_group)
-		balance_cpu = group_balance_cpu(group);
-
-	/* Tally up the load of all CPUs in the group */
-	max_cpu_load = 0;
-	min_cpu_load = ~0UL;
-	max_nr_running = 0;
-	min_nr_running = ~0UL;
+	init_sg_imb_stats(&sgi);
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
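As an illustration of the group_imb comment added in the hunk above (not part of the commit; plain userspace C with made-up names and values), the following sketch applies the same rule the new sg_imbalanced() uses: flag a group when the load spread across its CPUs is at least one average task weight and the nr_running spread exceeds one.

	#include <stdio.h>

	struct cpu_sample {
		unsigned long load;		/* weighted load on this cpu */
		unsigned long nr_running;	/* runnable tasks on this cpu */
	};

	/* Same test as the new sg_imbalanced(): load spread >= one average
	 * task weight, and the nr_running spread is larger than one. */
	static int group_imbalanced(const struct cpu_sample *cpus, int n,
				    unsigned long load_per_task)
	{
		unsigned long max_load = 0, min_load = ~0UL;
		unsigned long max_nr = 0, min_nr = ~0UL;
		int i;

		for (i = 0; i < n; i++) {
			if (cpus[i].load > max_load)
				max_load = cpus[i].load;
			if (cpus[i].load < min_load)
				min_load = cpus[i].load;
			if (cpus[i].nr_running > max_nr)
				max_nr = cpus[i].nr_running;
			if (cpus[i].nr_running < min_nr)
				min_nr = cpus[i].nr_running;
		}

		return (max_load - min_load) >= load_per_task &&
		       (max_nr - min_nr) > 1;
	}

	int main(void)
	{
		/* Group { 0 1 2 3 } after a group-wise balance: the two tasks
		 * it received can only run on cpu 3 (weight 1024 each). */
		struct cpu_sample group0[4] = {
			{ .load = 0,    .nr_running = 0 },
			{ .load = 0,    .nr_running = 0 },
			{ .load = 0,    .nr_running = 0 },
			{ .load = 2048, .nr_running = 2 },
		};

		printf("group imbalanced: %d\n",
		       group_imbalanced(group0, 4, 1024));	/* prints 1 */
		return 0;
	}

With the group-wise "balanced" placement from the comment's example, the group is reported imbalanced, which is what lets the balancer treat it as a busiest candidate.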
@@ -4539,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu &&
-					cpumask_test_cpu(i, sched_group_mask(group))) {
-				first_idle_cpu = 1;
-				balance_cpu = i;
-			}
-
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
-				max_cpu_load = load;
-			if (min_cpu_load > load)
-				min_cpu_load = load;
-
-			if (nr_running > max_nr_running)
-				max_nr_running = nr_running;
-			if (min_nr_running > nr_running)
-				min_nr_running = nr_running;
+			update_sg_imb_stats(&sgi, load, nr_running);
 		}
 
 		sgs->group_load += load;
@@ -4566,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			sgs->idle_cpus++;
 	}
 
-	/*
-	 * First idle cpu or the first cpu(busiest) in this sched group
-	 * is eligible for doing load balancing at this and above
-	 * domains. In the newly idle case, we will allow all the cpu's
-	 * to do the newly idle load balance.
-	 */
-	if (local_group) {
-		if (env->idle != CPU_NEWLY_IDLE) {
-			if (balance_cpu != env->dst_cpu) {
-				*balance = 0;
-				return;
-			}
-			update_group_power(env->sd, env->dst_cpu);
-		} else if (time_after_eq(jiffies, group->sgp->next_update))
-			update_group_power(env->sd, env->dst_cpu);
-	}
+	if (local_group && (env->idle != CPU_NEWLY_IDLE ||
+			time_after_eq(jiffies, group->sgp->next_update)))
+		update_group_power(env->sd, env->dst_cpu);
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+	sgs->group_power = group->sgp->power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-	/*
-	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of a task.
-	 *
-	 * APZ: with cgroup the avg task weight can vary wildly and
-	 *      might not be a suitable number - should we keep a
-	 *      normalized nr_running number somewhere that negates
-	 *      the hierarchy?
-	 */
 	if (sgs->sum_nr_running)
-		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
-	    (max_nr_running - min_nr_running) > 1)
-		sgs->group_imb = 1;
+	sgs->group_imb = sg_imbalanced(sgs, &sgi);
+
+	sgs->group_capacity =
+		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
-						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(env->sd, group);
+
 	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4630,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 				   struct sched_group *sg,
 				   struct sg_lb_stats *sgs)
 {
-	if (sgs->avg_load <= sds->max_load)
+	if (sgs->avg_load <= sds->busiest_stat.avg_load)
 		return false;
 
 	if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4663,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
  * @sds: variable to hold the statistics for this sched_domain.
  */
 static inline void update_sd_lb_stats(struct lb_env *env,
-					int *balance, struct sd_lb_stats *sds)
+					struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
-	struct sg_lb_stats sgs;
+	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
@@ -4676,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
+		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
 
 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
-		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
-
-		if (local_group && !(*balance))
-			return;
+		if (local_group) {
+			sds->local = sg;
+			sgs = &sds->local_stat;
+		}
 
-		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->sgp->power;
+		memset(sgs, 0, sizeof(*sgs));
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -4698,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 		 * heaviest group when it is already under-utilized (possible
 		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling && !local_group && sds->this_has_capacity)
-			sgs.group_capacity = min(sgs.group_capacity, 1UL);
+		if (prefer_sibling && !local_group &&
+				sds->local && sds->local_stat.group_has_capacity)
+			sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-		if (local_group) {
-			sds->this_load = sgs.avg_load;
-			sds->this = sg;
-			sds->this_nr_running = sgs.sum_nr_running;
-			sds->this_load_per_task = sgs.sum_weighted_load;
-			sds->this_has_capacity = sgs.group_has_capacity;
-			sds->this_idle_cpus = sgs.idle_cpus;
-		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
-			sds->max_load = sgs.avg_load;
+		/* Now, start updating sd_lb_stats */
+		sds->total_load += sgs->group_load;
+		sds->total_pwr += sgs->group_power;
+
+		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
-			sds->busiest_nr_running = sgs.sum_nr_running;
-			sds->busiest_idle_cpus = sgs.idle_cpus;
-			sds->busiest_group_capacity = sgs.group_capacity;
-			sds->busiest_load_per_task = sgs.sum_weighted_load;
-			sds->busiest_has_capacity = sgs.group_has_capacity;
-			sds->busiest_group_weight = sgs.group_weight;
-			sds->group_imb = sgs.group_imb;
+			sds->busiest_stat = *sgs;
 		}
 
 		sg = sg->next;
@@ -4762,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 		return 0;
 
 	env->imbalance = DIV_ROUND_CLOSEST(
-		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
+		SCHED_POWER_SCALE);
 
 	return 1;
 }
@@ -4780,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
 	unsigned long scaled_busy_load_per_task;
+	struct sg_lb_stats *local, *busiest;
 
-	if (sds->this_nr_running) {
-		sds->this_load_per_task /= sds->this_nr_running;
-		if (sds->busiest_load_per_task >
-				sds->this_load_per_task)
-			imbn = 1;
-	} else {
-		sds->this_load_per_task =
-			cpu_avg_load_per_task(env->dst_cpu);
-	}
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
 
-	scaled_busy_load_per_task = sds->busiest_load_per_task
-					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->sgp->power;
+	if (!local->sum_nr_running)
+		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+	else if (busiest->load_per_task > local->load_per_task)
+		imbn = 1;
 
-	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
-			(scaled_busy_load_per_task * imbn)) {
-		env->imbalance = sds->busiest_load_per_task;
+	scaled_busy_load_per_task =
+		(busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
+
+	if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
+	    (scaled_busy_load_per_task * imbn)) {
+		env->imbalance = busiest->load_per_task;
 		return;
 	}
 
@@ -4807,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->sgp->power *
-			min(sds->this_load_per_task, sds->this_load);
+	pwr_now += busiest->group_power *
+			min(busiest->load_per_task, busiest->avg_load);
+	pwr_now += local->group_power *
+			min(local->load_per_task, local->avg_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->sgp->power;
-	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load - tmp);
+	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
+	if (busiest->avg_load > tmp) {
+		pwr_move += busiest->group_power *
+			    min(busiest->load_per_task,
+				busiest->avg_load - tmp);
+	}
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->sgp->power <
-		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->sgp->power) /
-			sds->this->sgp->power;
-	else
-		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->sgp->power;
-	pwr_move += sds->this->sgp->power *
-			min(sds->this_load_per_task, sds->this_load + tmp);
+	if (busiest->avg_load * busiest->group_power <
+	    busiest->load_per_task * SCHED_POWER_SCALE) {
+		tmp = (busiest->avg_load * busiest->group_power) /
+		      local->group_power;
+	} else {
+		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+		      local->group_power;
+	}
+	pwr_move += local->group_power *
+		    min(local->load_per_task, local->avg_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
-		env->imbalance = sds->busiest_load_per_task;
+		env->imbalance = busiest->load_per_task;
 }
 
 /**
@@ -4846,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long max_pull, load_above_capacity = ~0UL;
+	struct sg_lb_stats *local, *busiest;
+
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
 
-	sds->busiest_load_per_task /= sds->busiest_nr_running;
-	if (sds->group_imb) {
-		sds->busiest_load_per_task =
-			min(sds->busiest_load_per_task, sds->avg_load);
+	if (busiest->group_imb) {
+		/*
+		 * In the group_imb case we cannot rely on group-wide averages
+		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 */
+		busiest->load_per_task =
+			min(busiest->load_per_task, sds->avg_load);
 	}
 
 	/*
@@ -4858,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (sds->max_load < sds->avg_load) {
+	if (busiest->avg_load < sds->avg_load) {
 		env->imbalance = 0;
 		return fix_small_imbalance(env, sds);
 	}
 
-	if (!sds->group_imb) {
+	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
+		 * Except of course for the group_imb case, since then we might
+		 * have to drop below capacity to reach cpu-load equilibrium.
 		 */
-		load_above_capacity = (sds->busiest_nr_running -
-						sds->busiest_group_capacity);
+		load_above_capacity =
+			(busiest->sum_nr_running - busiest->group_capacity);
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
-		load_above_capacity /= sds->busiest->sgp->power;
+		load_above_capacity /= busiest->group_power;
 	}
 
 	/*
@@ -4882,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
-	 * Be careful of negative numbers as they'll appear as very large values
-	 * with unsigned longs.
 	 */
-	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	env->imbalance = min(max_pull * sds->busiest->sgp->power,
-		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
-			/ SCHED_POWER_SCALE;
+	env->imbalance = min(
+		max_pull * busiest->group_power,
+		(sds->avg_load - local->avg_load) * local->group_power
+	) / SCHED_POWER_SCALE;
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
@@ -4898,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (env->imbalance < sds->busiest_load_per_task)
+	if (env->imbalance < busiest->load_per_task)
 		return fix_small_imbalance(env, sds);
-
 }
 
 /******* find_busiest_group() helpers end here *********************/
@@ -4916,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 * to restore balance.
 *
 * @env: The load balancing environment.
- * @balance: Pointer to a variable indicating if this_cpu
- *	is the appropriate cpu to perform load balancing at this_level.
 *
 * Return:	- The busiest group if imbalance exists.
 *		- If no imbalance and user has opted for power-savings balance,
 *		   return the least loaded group whose CPUs can be
 *		   put to idle by rebalancing its tasks onto our group.
 */
-static struct sched_group *
-find_busiest_group(struct lb_env *env, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
 {
+	struct sg_lb_stats *local, *busiest;
 	struct sd_lb_stats sds;
 
-	memset(&sds, 0, sizeof(sds));
+	init_sd_lb_stats(&sds);
 
 	/*
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(env, balance, &sds);
-
-	/*
-	 * this_cpu is not the appropriate cpu to perform load balancing at
-	 * this level.
-	 */
-	if (!(*balance))
-		goto ret;
+	update_sd_lb_stats(env, &sds);
+	local = &sds.local_stat;
+	busiest = &sds.busiest_stat;
 
 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
 	    check_asym_packing(env, &sds))
 		return sds.busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-	if (!sds.busiest || sds.busiest_nr_running == 0)
+	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
-	 * work because they assumes all things are equal, which typically
+	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (sds.group_imb)
+	if (busiest->group_imb)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-	    !sds.busiest_has_capacity)
+	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
+	    !busiest->group_has_capacity)
 		goto force_balance;
 
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (sds.this_load >= sds.max_load)
+	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	if (sds.this_load >= sds.avg_load)
+	if (local->avg_load >= sds.avg_load)
 		goto out_balanced;
 
 	if (env->idle == CPU_IDLE) {
@@ -4988,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance)
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-		    sds.busiest_nr_running <= sds.busiest_group_weight)
+		if ((local->idle_cpus < busiest->idle_cpus) &&
+		    busiest->sum_nr_running <= busiest->group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
+		if (100 * busiest->avg_load <=
+				env->sd->imbalance_pct * local->avg_load)
 			goto out_balanced;
 	}
 
@@ -5006,7 +5037,6 @@ force_balance:
 	return sds.busiest;
 
 out_balanced:
-ret:
 	env->imbalance = 0;
 	return NULL;
 }
@@ -5018,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 				     struct sched_group *group)
 {
 	struct rq *busiest = NULL, *rq;
-	unsigned long max_load = 0;
+	unsigned long busiest_load = 0, busiest_power = 1;
 	int i;
 
-	for_each_cpu(i, sched_group_cpus(group)) {
+	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		unsigned long power = power_of(i);
 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
 							   SCHED_POWER_SCALE);
@@ -5030,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		if (!cpumask_test_cpu(i, env->cpus))
-			continue;
-
 		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
@@ -5048,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * the weighted_cpuload() scaled with the cpu power, so that
 		 * the load can be moved away from the cpu that is potentially
 		 * running at a lower capacity.
+		 *
+		 * Thus we're looking for max(wl_i / power_i), crosswise
+		 * multiplication to rid ourselves of the division works out
+		 * to: wl_i * power_j > wl_j * power_i;  where j is our
+		 * previous maximum.
 		 */
-		wl = (wl * SCHED_POWER_SCALE) / power;
-
-		if (wl > max_load) {
-			max_load = wl;
+		if (wl * busiest_power > busiest_load * power) {
+			busiest_load = wl;
+			busiest_power = power;
 			busiest = rq;
 		}
 	}
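As an illustration of the comment added in the hunk above (not part of the commit; a plain userspace C sketch with made-up names and values): since all powers are positive, wl_i/power_i > wl_j/power_j exactly when wl_i * power_j > wl_j * power_i, so the running maximum can be tracked without any division.

	#include <stdio.h>

	struct cpu_stat {
		unsigned long wl;	/* weighted load */
		unsigned long power;	/* cpu power */
	};

	/* Return the index with the largest wl/power ratio, using the
	 * crosswise comparison wl_i * power_j > wl_j * power_i so that
	 * no per-iteration division is needed. */
	static int pick_busiest(const struct cpu_stat *cpus, int n)
	{
		unsigned long busiest_load = 0, busiest_power = 1;
		int i, busiest = -1;

		for (i = 0; i < n; i++) {
			if (cpus[i].wl * busiest_power >
			    busiest_load * cpus[i].power) {
				busiest_load = cpus[i].wl;
				busiest_power = cpus[i].power;
				busiest = i;
			}
		}
		return busiest;
	}

	int main(void)
	{
		struct cpu_stat cpus[3] = {
			{ .wl = 2048, .power = 1024 },	/* ratio 2.0 */
			{ .wl = 1536, .power =  512 },	/* ratio 3.0 */
			{ .wl = 1024, .power = 1024 },	/* ratio 1.0 */
		};

		printf("busiest index: %d\n", pick_busiest(cpus, 3));	/* 1 */
		return 0;
	}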
@@ -5089,13 +5120,47 @@ static int need_active_balance(struct lb_env *env)
 
 static int active_load_balance_cpu_stop(void *data);
 
+static int should_we_balance(struct lb_env *env)
+{
+	struct sched_group *sg = env->sd->groups;
+	struct cpumask *sg_cpus, *sg_mask;
+	int cpu, balance_cpu = -1;
+
+	/*
+	 * In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (env->idle == CPU_NEWLY_IDLE)
+		return 1;
+
+	sg_cpus = sched_group_cpus(sg);
+	sg_mask = sched_group_mask(sg);
+	/* Try to find first idle cpu */
+	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+			continue;
+
+		balance_cpu = cpu;
+		break;
+	}
+
+	if (balance_cpu == -1)
+		balance_cpu = group_balance_cpu(sg);
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above domains.
+	 */
+	return balance_cpu != env->dst_cpu;
+}
+
 /*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance)
+			int *continue_balancing)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
 	struct sched_group *group;
@@ -5125,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(&env, balance);
-
-	if (*balance == 0)
+	if (!should_we_balance(&env)) {
+		*continue_balancing = 0;
 		goto out_balanced;
+	}
 
+	group = find_busiest_group(&env);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -5341,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
-		int balance = 1;
+		int continue_balancing = 1;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -5349,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE, &balance);
+						   sd, CPU_NEWLY_IDLE,
+						   &continue_balancing);
 		}
 
 		interval = msecs_to_jiffies(sd->balance_interval);
@@ -5587,7 +5654,7 @@ void update_max_interval(void)
 */
 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
-	int balance = 1;
+	int continue_balancing = 1;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
@@ -5619,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance)) {
+			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
 				 * The LBF_SOME_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
@@ -5642,7 +5709,7 @@ out:
 		 * CPU in our sched group which is doing load balancing more
 		 * actively.
 		 */
-		if (!balance)
+		if (!continue_balancing)
 			break;
 	}
 	rcu_read_unlock();
@@ -5938,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * and ensure we don't carry in an old decay_count if we
 	 * switch back.
 	 */
-	if (p->se.avg.decay_count) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
-		__synchronize_entity_decay(&p->se);
-		subtract_blocked_load_contrib(cfs_rq,
-				p->se.avg.load_avg_contrib);
+	if (se->avg.decay_count) {
+		__synchronize_entity_decay(se);
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 	}
 #endif
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index b1c9034bdfcb..449b707fc20d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void)
 
 	while (!list_empty(&list)) {
 		struct call_single_data *csd;
-		unsigned int csd_flags;
 
 		csd = list_entry(list.next, struct call_single_data, list);
 		list_del(&csd->list);
 
-		/*
-		 * 'csd' can be invalid after this call if flags == 0
-		 * (when called through generic_exec_single()),
-		 * so save them away before making the call:
-		 */
-		csd_flags = csd->flags;
-
 		csd->func(csd->info);
 
-		/*
-		 * Unlocked CSDs are valid through generic_exec_single():
-		 */
-		if (csd_flags & CSD_FLAG_LOCK)
-			csd_unlock(csd);
+		csd_unlock(csd);
 	}
 }
 