diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-04 11:36:35 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-04 11:36:35 -0400 |
| commit | 5e0b3a4e88012d259e8b2c0f02f393c79686daf9 (patch) | |
| tree | 1c6d7be145a7cce77996049eb78877ed95e87a4f /kernel | |
| parent | 0d99b7087324978b09b59d8c7a0736214c4a42b1 (diff) | |
| parent | 10866e62e8a6907d9072f10f9a0561db0c0cf50b (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"Various optimizations, cleanups and smaller fixes - no major changes
in scheduler behavior"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Fix the sd_parent_degenerate() code
sched/fair: Rework and comment the group_imb code
sched/fair: Optimize find_busiest_queue()
sched/fair: Make group power more consistent
sched/fair: Remove duplicate load_per_task computations
sched/fair: Shrink sg_lb_stats and play memset games
sched: Clean-up struct sd_lb_stat
sched: Factor out code to should_we_balance()
sched: Remove one division operation in find_busiest_queue()
sched/cputime: Use this_cpu_add() in task_group_account_field()
cpumask: Fix cpumask leak in partition_sched_domains()
sched/x86: Optimize switch_mm() for multi-threaded workloads
generic-ipi: Kill unnecessary variable - csd_flags
numa: Mark __node_set() as __always_inline
sched/fair: Cleanup: remove duplicate variable declaration
sched/__wake_up_sync_key(): Fix nr_exclusive tasks which lead to WF_SYNC clearing
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/core.c | 17 | ||||
| -rw-r--r-- | kernel/sched/cputime.c | 2 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 511 | ||||
| -rw-r--r-- | kernel/smp.c | 14 |
4 files changed, 303 insertions(+), 241 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 57c186d9477e..b8e2162fc803 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -2677,7 +2677,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
| 2677 | if (unlikely(!q)) | 2677 | if (unlikely(!q)) |
| 2678 | return; | 2678 | return; |
| 2679 | 2679 | ||
| 2680 | if (unlikely(!nr_exclusive)) | 2680 | if (unlikely(nr_exclusive != 1)) |
| 2681 | wake_flags = 0; | 2681 | wake_flags = 0; |
| 2682 | 2682 | ||
| 2683 | spin_lock_irqsave(&q->lock, flags); | 2683 | spin_lock_irqsave(&q->lock, flags); |
| @@ -4964,7 +4964,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 4964 | SD_BALANCE_FORK | | 4964 | SD_BALANCE_FORK | |
| 4965 | SD_BALANCE_EXEC | | 4965 | SD_BALANCE_EXEC | |
| 4966 | SD_SHARE_CPUPOWER | | 4966 | SD_SHARE_CPUPOWER | |
| 4967 | SD_SHARE_PKG_RESOURCES); | 4967 | SD_SHARE_PKG_RESOURCES | |
| 4968 | SD_PREFER_SIBLING); | ||
| 4968 | if (nr_node_ids == 1) | 4969 | if (nr_node_ids == 1) |
| 4969 | pflags &= ~SD_SERIALIZE; | 4970 | pflags &= ~SD_SERIALIZE; |
| 4970 | } | 4971 | } |
| @@ -5173,6 +5174,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5173 | tmp->parent = parent->parent; | 5174 | tmp->parent = parent->parent; |
| 5174 | if (parent->parent) | 5175 | if (parent->parent) |
| 5175 | parent->parent->child = tmp; | 5176 | parent->parent->child = tmp; |
| 5177 | /* | ||
| 5178 | * Transfer SD_PREFER_SIBLING down in case of a | ||
| 5179 | * degenerate parent; the spans match for this | ||
| 5180 | * so the property transfers. | ||
| 5181 | */ | ||
| 5182 | if (parent->flags & SD_PREFER_SIBLING) | ||
| 5183 | tmp->flags |= SD_PREFER_SIBLING; | ||
| 5176 | destroy_sched_domain(parent, cpu); | 5184 | destroy_sched_domain(parent, cpu); |
| 5177 | } else | 5185 | } else |
| 5178 | tmp = tmp->parent; | 5186 | tmp = tmp->parent; |
| @@ -6239,8 +6247,9 @@ match1: | |||
| 6239 | ; | 6247 | ; |
| 6240 | } | 6248 | } |
| 6241 | 6249 | ||
| 6250 | n = ndoms_cur; | ||
| 6242 | if (doms_new == NULL) { | 6251 | if (doms_new == NULL) { |
| 6243 | ndoms_cur = 0; | 6252 | n = 0; |
| 6244 | doms_new = &fallback_doms; | 6253 | doms_new = &fallback_doms; |
| 6245 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 6254 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
| 6246 | WARN_ON_ONCE(dattr_new); | 6255 | WARN_ON_ONCE(dattr_new); |
| @@ -6248,7 +6257,7 @@ match1: | |||
| 6248 | 6257 | ||
| 6249 | /* Build new domains */ | 6258 | /* Build new domains */ |
| 6250 | for (i = 0; i < ndoms_new; i++) { | 6259 | for (i = 0; i < ndoms_new; i++) { |
| 6251 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 6260 | for (j = 0; j < n && !new_topology; j++) { |
| 6252 | if (cpumask_equal(doms_new[i], doms_cur[j]) | 6261 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
| 6253 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 6262 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
| 6254 | goto match2; | 6263 | goto match2; |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..e89ccefef278 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
| 121 | * is the only cgroup, then nothing else should be necessary. | 121 | * is the only cgroup, then nothing else should be necessary. |
| 122 | * | 122 | * |
| 123 | */ | 123 | */ |
| 124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __this_cpu_add(kernel_cpustat.cpustat[index], tmp); |
| 125 | 125 | ||
| 126 | cpuacct_account_field(p, index, tmp); | 126 | cpuacct_account_field(p, index, tmp); |
| 127 | } | 127 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8977a249816f..7f0a5e6cdae0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -4277,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p) | |||
| 4277 | 4277 | ||
| 4278 | /********** Helpers for find_busiest_group ************************/ | 4278 | /********** Helpers for find_busiest_group ************************/ |
| 4279 | /* | 4279 | /* |
| 4280 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
| 4281 | * during load balancing. | ||
| 4282 | */ | ||
| 4283 | struct sd_lb_stats { | ||
| 4284 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
| 4285 | struct sched_group *this; /* Local group in this sd */ | ||
| 4286 | unsigned long total_load; /* Total load of all groups in sd */ | ||
| 4287 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
| 4288 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
| 4289 | |||
| 4290 | /** Statistics of this group */ | ||
| 4291 | unsigned long this_load; | ||
| 4292 | unsigned long this_load_per_task; | ||
| 4293 | unsigned long this_nr_running; | ||
| 4294 | unsigned long this_has_capacity; | ||
| 4295 | unsigned int this_idle_cpus; | ||
| 4296 | |||
| 4297 | /* Statistics of the busiest group */ | ||
| 4298 | unsigned int busiest_idle_cpus; | ||
| 4299 | unsigned long max_load; | ||
| 4300 | unsigned long busiest_load_per_task; | ||
| 4301 | unsigned long busiest_nr_running; | ||
| 4302 | unsigned long busiest_group_capacity; | ||
| 4303 | unsigned long busiest_has_capacity; | ||
| 4304 | unsigned int busiest_group_weight; | ||
| 4305 | |||
| 4306 | int group_imb; /* Is there imbalance in this sd */ | ||
| 4307 | }; | ||
| 4308 | |||
| 4309 | /* | ||
| 4310 | * sg_lb_stats - stats of a sched_group required for load_balancing | 4280 | * sg_lb_stats - stats of a sched_group required for load_balancing |
| 4311 | */ | 4281 | */ |
| 4312 | struct sg_lb_stats { | 4282 | struct sg_lb_stats { |
| 4313 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 4283 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
| 4314 | unsigned long group_load; /* Total load over the CPUs of the group */ | 4284 | unsigned long group_load; /* Total load over the CPUs of the group */ |
| 4315 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
| 4316 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 4285 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 4317 | unsigned long group_capacity; | 4286 | unsigned long load_per_task; |
| 4318 | unsigned long idle_cpus; | 4287 | unsigned long group_power; |
| 4319 | unsigned long group_weight; | 4288 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
| 4289 | unsigned int group_capacity; | ||
| 4290 | unsigned int idle_cpus; | ||
| 4291 | unsigned int group_weight; | ||
| 4320 | int group_imb; /* Is there an imbalance in the group ? */ | 4292 | int group_imb; /* Is there an imbalance in the group ? */ |
| 4321 | int group_has_capacity; /* Is there extra capacity in the group? */ | 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ |
| 4322 | }; | 4294 | }; |
| 4323 | 4295 | ||
| 4296 | /* | ||
| 4297 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
| 4298 | * during load balancing. | ||
| 4299 | */ | ||
| 4300 | struct sd_lb_stats { | ||
| 4301 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
| 4302 | struct sched_group *local; /* Local group in this sd */ | ||
| 4303 | unsigned long total_load; /* Total load of all groups in sd */ | ||
| 4304 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
| 4305 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
| 4306 | |||
| 4307 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | ||
| 4308 | struct sg_lb_stats local_stat; /* Statistics of the local group */ | ||
| 4309 | }; | ||
| 4310 | |||
| 4311 | static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | ||
| 4312 | { | ||
| 4313 | /* | ||
| 4314 | * Skimp on the clearing to avoid duplicate work. We can avoid clearing | ||
| 4315 | * local_stat because update_sg_lb_stats() does a full clear/assignment. | ||
| 4316 | * We must however clear busiest_stat::avg_load because | ||
| 4317 | * update_sd_pick_busiest() reads this before assignment. | ||
| 4318 | */ | ||
| 4319 | *sds = (struct sd_lb_stats){ | ||
| 4320 | .busiest = NULL, | ||
| 4321 | .local = NULL, | ||
| 4322 | .total_load = 0UL, | ||
| 4323 | .total_pwr = 0UL, | ||
| 4324 | .busiest_stat = { | ||
| 4325 | .avg_load = 0UL, | ||
| 4326 | }, | ||
| 4327 | }; | ||
| 4328 | } | ||
| 4329 | |||
| 4324 | /** | 4330 | /** |
| 4325 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
| 4326 | * @sd: The sched_domain whose load_idx is to be obtained. | 4332 | * @sd: The sched_domain whose load_idx is to be obtained. |
| @@ -4504,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4504 | return 0; | 4510 | return 0; |
| 4505 | } | 4511 | } |
| 4506 | 4512 | ||
| 4513 | /* | ||
| 4514 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
| 4515 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
| 4516 | * | ||
| 4517 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
| 4518 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
| 4519 | * Something like: | ||
| 4520 | * | ||
| 4521 | * { 0 1 2 3 } { 4 5 6 7 } | ||
| 4522 | * * * * * | ||
| 4523 | * | ||
| 4524 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
| 4525 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
| 4526 | * cpu 3 and leave one of the cpus in the second group unused. | ||
| 4527 | * | ||
| 4528 | * The current solution to this issue is detecting the skew in the first group | ||
| 4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | ||
| 4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | ||
| 4531 | * sg_imbalanced(). | ||
| 4532 | * | ||
| 4533 | * When this is so detected; this group becomes a candidate for busiest; see | ||
| 4534 | * update_sd_pick_busiest(). And calculcate_imbalance() and | ||
| 4535 | * find_busiest_group() avoid some of the usual balance conditional to allow it | ||
| 4536 | * to create an effective group imbalance. | ||
| 4537 | * | ||
| 4538 | * This is a somewhat tricky proposition since the next run might not find the | ||
| 4539 | * group imbalance and decide the groups need to be balanced again. A most | ||
| 4540 | * subtle and fragile situation. | ||
| 4541 | */ | ||
| 4542 | |||
| 4543 | struct sg_imb_stats { | ||
| 4544 | unsigned long max_nr_running, min_nr_running; | ||
| 4545 | unsigned long max_cpu_load, min_cpu_load; | ||
| 4546 | }; | ||
| 4547 | |||
| 4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
| 4549 | { | ||
| 4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | ||
| 4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
| 4552 | } | ||
| 4553 | |||
| 4554 | static inline void | ||
| 4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | ||
| 4556 | unsigned long load, unsigned long nr_running) | ||
| 4557 | { | ||
| 4558 | if (load > sgi->max_cpu_load) | ||
| 4559 | sgi->max_cpu_load = load; | ||
| 4560 | if (sgi->min_cpu_load > load) | ||
| 4561 | sgi->min_cpu_load = load; | ||
| 4562 | |||
| 4563 | if (nr_running > sgi->max_nr_running) | ||
| 4564 | sgi->max_nr_running = nr_running; | ||
| 4565 | if (sgi->min_nr_running > nr_running) | ||
| 4566 | sgi->min_nr_running = nr_running; | ||
| 4567 | } | ||
| 4568 | |||
| 4569 | static inline int | ||
| 4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | ||
| 4571 | { | ||
| 4572 | /* | ||
| 4573 | * Consider the group unbalanced when the imbalance is larger | ||
| 4574 | * than the average weight of a task. | ||
| 4575 | * | ||
| 4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4577 | * might not be a suitable number - should we keep a | ||
| 4578 | * normalized nr_running number somewhere that negates | ||
| 4579 | * the hierarchy? | ||
| 4580 | */ | ||
| 4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
| 4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
| 4583 | return 1; | ||
| 4584 | |||
| 4585 | return 0; | ||
| 4586 | } | ||
| 4587 | |||
| 4507 | /** | 4588 | /** |
| 4508 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 4589 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 4509 | * @env: The load balancing environment. | 4590 | * @env: The load balancing environment. |
| 4510 | * @group: sched_group whose statistics are to be updated. | 4591 | * @group: sched_group whose statistics are to be updated. |
| 4511 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 4592 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
| 4512 | * @local_group: Does group contain this_cpu. | 4593 | * @local_group: Does group contain this_cpu. |
| 4513 | * @balance: Should we balance. | ||
| 4514 | * @sgs: variable to hold the statistics for this group. | 4594 | * @sgs: variable to hold the statistics for this group. |
| 4515 | */ | 4595 | */ |
| 4516 | static inline void update_sg_lb_stats(struct lb_env *env, | 4596 | static inline void update_sg_lb_stats(struct lb_env *env, |
| 4517 | struct sched_group *group, int load_idx, | 4597 | struct sched_group *group, int load_idx, |
| 4518 | int local_group, int *balance, struct sg_lb_stats *sgs) | 4598 | int local_group, struct sg_lb_stats *sgs) |
| 4519 | { | 4599 | { |
| 4520 | unsigned long nr_running, max_nr_running, min_nr_running; | 4600 | struct sg_imb_stats sgi; |
| 4521 | unsigned long load, max_cpu_load, min_cpu_load; | 4601 | unsigned long nr_running; |
| 4522 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 4602 | unsigned long load; |
| 4523 | unsigned long avg_load_per_task = 0; | ||
| 4524 | int i; | 4603 | int i; |
| 4525 | 4604 | ||
| 4526 | if (local_group) | 4605 | init_sg_imb_stats(&sgi); |
| 4527 | balance_cpu = group_balance_cpu(group); | ||
| 4528 | |||
| 4529 | /* Tally up the load of all CPUs in the group */ | ||
| 4530 | max_cpu_load = 0; | ||
| 4531 | min_cpu_load = ~0UL; | ||
| 4532 | max_nr_running = 0; | ||
| 4533 | min_nr_running = ~0UL; | ||
| 4534 | 4606 | ||
| 4535 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 4536 | struct rq *rq = cpu_rq(i); | 4608 | struct rq *rq = cpu_rq(i); |
| @@ -4539,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4539 | 4611 | ||
| 4540 | /* Bias balancing toward cpus of our domain */ | 4612 | /* Bias balancing toward cpus of our domain */ |
| 4541 | if (local_group) { | 4613 | if (local_group) { |
| 4542 | if (idle_cpu(i) && !first_idle_cpu && | ||
| 4543 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
| 4544 | first_idle_cpu = 1; | ||
| 4545 | balance_cpu = i; | ||
| 4546 | } | ||
| 4547 | |||
| 4548 | load = target_load(i, load_idx); | 4614 | load = target_load(i, load_idx); |
| 4549 | } else { | 4615 | } else { |
| 4550 | load = source_load(i, load_idx); | 4616 | load = source_load(i, load_idx); |
| 4551 | if (load > max_cpu_load) | 4617 | update_sg_imb_stats(&sgi, load, nr_running); |
| 4552 | max_cpu_load = load; | ||
| 4553 | if (min_cpu_load > load) | ||
| 4554 | min_cpu_load = load; | ||
| 4555 | |||
| 4556 | if (nr_running > max_nr_running) | ||
| 4557 | max_nr_running = nr_running; | ||
| 4558 | if (min_nr_running > nr_running) | ||
| 4559 | min_nr_running = nr_running; | ||
| 4560 | } | 4618 | } |
| 4561 | 4619 | ||
| 4562 | sgs->group_load += load; | 4620 | sgs->group_load += load; |
| @@ -4566,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4566 | sgs->idle_cpus++; | 4624 | sgs->idle_cpus++; |
| 4567 | } | 4625 | } |
| 4568 | 4626 | ||
| 4569 | /* | 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || |
| 4570 | * First idle cpu or the first cpu(busiest) in this sched group | 4628 | time_after_eq(jiffies, group->sgp->next_update))) |
| 4571 | * is eligible for doing load balancing at this and above | 4629 | update_group_power(env->sd, env->dst_cpu); |
| 4572 | * domains. In the newly idle case, we will allow all the cpu's | ||
| 4573 | * to do the newly idle load balance. | ||
| 4574 | */ | ||
| 4575 | if (local_group) { | ||
| 4576 | if (env->idle != CPU_NEWLY_IDLE) { | ||
| 4577 | if (balance_cpu != env->dst_cpu) { | ||
| 4578 | *balance = 0; | ||
| 4579 | return; | ||
| 4580 | } | ||
| 4581 | update_group_power(env->sd, env->dst_cpu); | ||
| 4582 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
| 4583 | update_group_power(env->sd, env->dst_cpu); | ||
| 4584 | } | ||
| 4585 | 4630 | ||
| 4586 | /* Adjust by relative CPU power of the group */ | 4631 | /* Adjust by relative CPU power of the group */ |
| 4587 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 4632 | sgs->group_power = group->sgp->power; |
| 4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | ||
| 4588 | 4634 | ||
| 4589 | /* | ||
| 4590 | * Consider the group unbalanced when the imbalance is larger | ||
| 4591 | * than the average weight of a task. | ||
| 4592 | * | ||
| 4593 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4594 | * might not be a suitable number - should we keep a | ||
| 4595 | * normalized nr_running number somewhere that negates | ||
| 4596 | * the hierarchy? | ||
| 4597 | */ | ||
| 4598 | if (sgs->sum_nr_running) | 4635 | if (sgs->sum_nr_running) |
| 4599 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 4600 | 4637 | ||
| 4601 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | 4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); |
| 4602 | (max_nr_running - min_nr_running) > 1) | 4639 | |
| 4603 | sgs->group_imb = 1; | 4640 | sgs->group_capacity = |
| 4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | ||
| 4604 | 4642 | ||
| 4605 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | ||
| 4606 | SCHED_POWER_SCALE); | ||
| 4607 | if (!sgs->group_capacity) | 4643 | if (!sgs->group_capacity) |
| 4608 | sgs->group_capacity = fix_small_capacity(env->sd, group); | 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
| 4645 | |||
| 4609 | sgs->group_weight = group->group_weight; | 4646 | sgs->group_weight = group->group_weight; |
| 4610 | 4647 | ||
| 4611 | if (sgs->group_capacity > sgs->sum_nr_running) | 4648 | if (sgs->group_capacity > sgs->sum_nr_running) |
| @@ -4630,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 4630 | struct sched_group *sg, | 4667 | struct sched_group *sg, |
| 4631 | struct sg_lb_stats *sgs) | 4668 | struct sg_lb_stats *sgs) |
| 4632 | { | 4669 | { |
| 4633 | if (sgs->avg_load <= sds->max_load) | 4670 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
| 4634 | return false; | 4671 | return false; |
| 4635 | 4672 | ||
| 4636 | if (sgs->sum_nr_running > sgs->group_capacity) | 4673 | if (sgs->sum_nr_running > sgs->group_capacity) |
| @@ -4663,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 4663 | * @sds: variable to hold the statistics for this sched_domain. | 4700 | * @sds: variable to hold the statistics for this sched_domain. |
| 4664 | */ | 4701 | */ |
| 4665 | static inline void update_sd_lb_stats(struct lb_env *env, | 4702 | static inline void update_sd_lb_stats(struct lb_env *env, |
| 4666 | int *balance, struct sd_lb_stats *sds) | 4703 | struct sd_lb_stats *sds) |
| 4667 | { | 4704 | { |
| 4668 | struct sched_domain *child = env->sd->child; | 4705 | struct sched_domain *child = env->sd->child; |
| 4669 | struct sched_group *sg = env->sd->groups; | 4706 | struct sched_group *sg = env->sd->groups; |
| 4670 | struct sg_lb_stats sgs; | 4707 | struct sg_lb_stats tmp_sgs; |
| 4671 | int load_idx, prefer_sibling = 0; | 4708 | int load_idx, prefer_sibling = 0; |
| 4672 | 4709 | ||
| 4673 | if (child && child->flags & SD_PREFER_SIBLING) | 4710 | if (child && child->flags & SD_PREFER_SIBLING) |
| @@ -4676,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4676 | load_idx = get_sd_load_idx(env->sd, env->idle); | 4713 | load_idx = get_sd_load_idx(env->sd, env->idle); |
| 4677 | 4714 | ||
| 4678 | do { | 4715 | do { |
| 4716 | struct sg_lb_stats *sgs = &tmp_sgs; | ||
| 4679 | int local_group; | 4717 | int local_group; |
| 4680 | 4718 | ||
| 4681 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 4719 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
| 4682 | memset(&sgs, 0, sizeof(sgs)); | 4720 | if (local_group) { |
| 4683 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 4721 | sds->local = sg; |
| 4684 | 4722 | sgs = &sds->local_stat; | |
| 4685 | if (local_group && !(*balance)) | 4723 | } |
| 4686 | return; | ||
| 4687 | 4724 | ||
| 4688 | sds->total_load += sgs.group_load; | 4725 | memset(sgs, 0, sizeof(*sgs)); |
| 4689 | sds->total_pwr += sg->sgp->power; | 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
| 4690 | 4727 | ||
| 4691 | /* | 4728 | /* |
| 4692 | * In case the child domain prefers tasks go to siblings | 4729 | * In case the child domain prefers tasks go to siblings |
| @@ -4698,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4698 | * heaviest group when it is already under-utilized (possible | 4735 | * heaviest group when it is already under-utilized (possible |
| 4699 | * with a large weight task outweighs the tasks on the system). | 4736 | * with a large weight task outweighs the tasks on the system). |
| 4700 | */ | 4737 | */ |
| 4701 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 4738 | if (prefer_sibling && !local_group && |
| 4702 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 4739 | sds->local && sds->local_stat.group_has_capacity) |
| 4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | ||
| 4703 | 4741 | ||
| 4704 | if (local_group) { | 4742 | /* Now, start updating sd_lb_stats */ |
| 4705 | sds->this_load = sgs.avg_load; | 4743 | sds->total_load += sgs->group_load; |
| 4706 | sds->this = sg; | 4744 | sds->total_pwr += sgs->group_power; |
| 4707 | sds->this_nr_running = sgs.sum_nr_running; | 4745 | |
| 4708 | sds->this_load_per_task = sgs.sum_weighted_load; | 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { |
| 4709 | sds->this_has_capacity = sgs.group_has_capacity; | ||
| 4710 | sds->this_idle_cpus = sgs.idle_cpus; | ||
| 4711 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | ||
| 4712 | sds->max_load = sgs.avg_load; | ||
| 4713 | sds->busiest = sg; | 4747 | sds->busiest = sg; |
| 4714 | sds->busiest_nr_running = sgs.sum_nr_running; | 4748 | sds->busiest_stat = *sgs; |
| 4715 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
| 4716 | sds->busiest_group_capacity = sgs.group_capacity; | ||
| 4717 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
| 4718 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
| 4719 | sds->busiest_group_weight = sgs.group_weight; | ||
| 4720 | sds->group_imb = sgs.group_imb; | ||
| 4721 | } | 4749 | } |
| 4722 | 4750 | ||
| 4723 | sg = sg->next; | 4751 | sg = sg->next; |
| @@ -4762,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4762 | return 0; | 4790 | return 0; |
| 4763 | 4791 | ||
| 4764 | env->imbalance = DIV_ROUND_CLOSEST( | 4792 | env->imbalance = DIV_ROUND_CLOSEST( |
| 4765 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 4793 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, |
| 4794 | SCHED_POWER_SCALE); | ||
| 4766 | 4795 | ||
| 4767 | return 1; | 4796 | return 1; |
| 4768 | } | 4797 | } |
| @@ -4780,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4780 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 4809 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
| 4781 | unsigned int imbn = 2; | 4810 | unsigned int imbn = 2; |
| 4782 | unsigned long scaled_busy_load_per_task; | 4811 | unsigned long scaled_busy_load_per_task; |
| 4812 | struct sg_lb_stats *local, *busiest; | ||
| 4783 | 4813 | ||
| 4784 | if (sds->this_nr_running) { | 4814 | local = &sds->local_stat; |
| 4785 | sds->this_load_per_task /= sds->this_nr_running; | 4815 | busiest = &sds->busiest_stat; |
| 4786 | if (sds->busiest_load_per_task > | ||
| 4787 | sds->this_load_per_task) | ||
| 4788 | imbn = 1; | ||
| 4789 | } else { | ||
| 4790 | sds->this_load_per_task = | ||
| 4791 | cpu_avg_load_per_task(env->dst_cpu); | ||
| 4792 | } | ||
| 4793 | 4816 | ||
| 4794 | scaled_busy_load_per_task = sds->busiest_load_per_task | 4817 | if (!local->sum_nr_running) |
| 4795 | * SCHED_POWER_SCALE; | 4818 | local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); |
| 4796 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 4819 | else if (busiest->load_per_task > local->load_per_task) |
| 4820 | imbn = 1; | ||
| 4797 | 4821 | ||
| 4798 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 4822 | scaled_busy_load_per_task = |
| 4799 | (scaled_busy_load_per_task * imbn)) { | 4823 | (busiest->load_per_task * SCHED_POWER_SCALE) / |
| 4800 | env->imbalance = sds->busiest_load_per_task; | 4824 | busiest->group_power; |
| 4825 | |||
| 4826 | if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= | ||
| 4827 | (scaled_busy_load_per_task * imbn)) { | ||
| 4828 | env->imbalance = busiest->load_per_task; | ||
| 4801 | return; | 4829 | return; |
| 4802 | } | 4830 | } |
| 4803 | 4831 | ||
| @@ -4807,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4807 | * moving them. | 4835 | * moving them. |
| 4808 | */ | 4836 | */ |
| 4809 | 4837 | ||
| 4810 | pwr_now += sds->busiest->sgp->power * | 4838 | pwr_now += busiest->group_power * |
| 4811 | min(sds->busiest_load_per_task, sds->max_load); | 4839 | min(busiest->load_per_task, busiest->avg_load); |
| 4812 | pwr_now += sds->this->sgp->power * | 4840 | pwr_now += local->group_power * |
| 4813 | min(sds->this_load_per_task, sds->this_load); | 4841 | min(local->load_per_task, local->avg_load); |
| 4814 | pwr_now /= SCHED_POWER_SCALE; | 4842 | pwr_now /= SCHED_POWER_SCALE; |
| 4815 | 4843 | ||
| 4816 | /* Amount of load we'd subtract */ | 4844 | /* Amount of load we'd subtract */ |
| 4817 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4845 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
| 4818 | sds->busiest->sgp->power; | 4846 | busiest->group_power; |
| 4819 | if (sds->max_load > tmp) | 4847 | if (busiest->avg_load > tmp) { |
| 4820 | pwr_move += sds->busiest->sgp->power * | 4848 | pwr_move += busiest->group_power * |
| 4821 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4849 | min(busiest->load_per_task, |
| 4850 | busiest->avg_load - tmp); | ||
| 4851 | } | ||
| 4822 | 4852 | ||
| 4823 | /* Amount of load we'd add */ | 4853 | /* Amount of load we'd add */ |
| 4824 | if (sds->max_load * sds->busiest->sgp->power < | 4854 | if (busiest->avg_load * busiest->group_power < |
| 4825 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 4855 | busiest->load_per_task * SCHED_POWER_SCALE) { |
| 4826 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 4856 | tmp = (busiest->avg_load * busiest->group_power) / |
| 4827 | sds->this->sgp->power; | 4857 | local->group_power; |
| 4828 | else | 4858 | } else { |
| 4829 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4859 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
| 4830 | sds->this->sgp->power; | 4860 | local->group_power; |
| 4831 | pwr_move += sds->this->sgp->power * | 4861 | } |
| 4832 | min(sds->this_load_per_task, sds->this_load + tmp); | 4862 | pwr_move += local->group_power * |
| 4863 | min(local->load_per_task, local->avg_load + tmp); | ||
| 4833 | pwr_move /= SCHED_POWER_SCALE; | 4864 | pwr_move /= SCHED_POWER_SCALE; |
| 4834 | 4865 | ||
| 4835 | /* Move if we gain throughput */ | 4866 | /* Move if we gain throughput */ |
| 4836 | if (pwr_move > pwr_now) | 4867 | if (pwr_move > pwr_now) |
| 4837 | env->imbalance = sds->busiest_load_per_task; | 4868 | env->imbalance = busiest->load_per_task; |
| 4838 | } | 4869 | } |
| 4839 | 4870 | ||
| 4840 | /** | 4871 | /** |
| @@ -4846,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4846 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 4877 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
| 4847 | { | 4878 | { |
| 4848 | unsigned long max_pull, load_above_capacity = ~0UL; | 4879 | unsigned long max_pull, load_above_capacity = ~0UL; |
| 4880 | struct sg_lb_stats *local, *busiest; | ||
| 4881 | |||
| 4882 | local = &sds->local_stat; | ||
| 4883 | busiest = &sds->busiest_stat; | ||
| 4849 | 4884 | ||
| 4850 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 4885 | if (busiest->group_imb) { |
| 4851 | if (sds->group_imb) { | 4886 | /* |
| 4852 | sds->busiest_load_per_task = | 4887 | * In the group_imb case we cannot rely on group-wide averages |
| 4853 | min(sds->busiest_load_per_task, sds->avg_load); | 4888 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
| 4889 | */ | ||
| 4890 | busiest->load_per_task = | ||
| 4891 | min(busiest->load_per_task, sds->avg_load); | ||
| 4854 | } | 4892 | } |
| 4855 | 4893 | ||
| 4856 | /* | 4894 | /* |
| @@ -4858,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4858 | * max load less than avg load(as we skip the groups at or below | 4896 | * max load less than avg load(as we skip the groups at or below |
| 4859 | * its cpu_power, while calculating max_load..) | 4897 | * its cpu_power, while calculating max_load..) |
| 4860 | */ | 4898 | */ |
| 4861 | if (sds->max_load < sds->avg_load) { | 4899 | if (busiest->avg_load < sds->avg_load) { |
| 4862 | env->imbalance = 0; | 4900 | env->imbalance = 0; |
| 4863 | return fix_small_imbalance(env, sds); | 4901 | return fix_small_imbalance(env, sds); |
| 4864 | } | 4902 | } |
| 4865 | 4903 | ||
| 4866 | if (!sds->group_imb) { | 4904 | if (!busiest->group_imb) { |
| 4867 | /* | 4905 | /* |
| 4868 | * Don't want to pull so many tasks that a group would go idle. | 4906 | * Don't want to pull so many tasks that a group would go idle. |
| 4907 | * Except of course for the group_imb case, since then we might | ||
| 4908 | * have to drop below capacity to reach cpu-load equilibrium. | ||
| 4869 | */ | 4909 | */ |
| 4870 | load_above_capacity = (sds->busiest_nr_running - | 4910 | load_above_capacity = |
| 4871 | sds->busiest_group_capacity); | 4911 | (busiest->sum_nr_running - busiest->group_capacity); |
| 4872 | 4912 | ||
| 4873 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 4913 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
| 4874 | 4914 | load_above_capacity /= busiest->group_power; | |
| 4875 | load_above_capacity /= sds->busiest->sgp->power; | ||
| 4876 | } | 4915 | } |
| 4877 | 4916 | ||
| 4878 | /* | 4917 | /* |
| @@ -4882,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4882 | * we also don't want to reduce the group load below the group capacity | 4921 | * we also don't want to reduce the group load below the group capacity |
| 4883 | * (so that we can implement power-savings policies etc). Thus we look | 4922 | * (so that we can implement power-savings policies etc). Thus we look |
| 4884 | * for the minimum possible imbalance. | 4923 | * for the minimum possible imbalance. |
| 4885 | * Be careful of negative numbers as they'll appear as very large values | ||
| 4886 | * with unsigned longs. | ||
| 4887 | */ | 4924 | */ |
| 4888 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4925 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
| 4889 | 4926 | ||
| 4890 | /* How much load to actually move to equalise the imbalance */ | 4927 | /* How much load to actually move to equalise the imbalance */ |
| 4891 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 4928 | env->imbalance = min( |
| 4892 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4929 | max_pull * busiest->group_power, |
| 4893 | / SCHED_POWER_SCALE; | 4930 | (sds->avg_load - local->avg_load) * local->group_power |
| 4931 | ) / SCHED_POWER_SCALE; | ||
| 4894 | 4932 | ||
| 4895 | /* | 4933 | /* |
| 4896 | * if *imbalance is less than the average load per runnable task | 4934 | * if *imbalance is less than the average load per runnable task |
| @@ -4898,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4898 | * a think about bumping its value to force at least one task to be | 4936 | * a think about bumping its value to force at least one task to be |
| 4899 | * moved | 4937 | * moved |
| 4900 | */ | 4938 | */ |
| 4901 | if (env->imbalance < sds->busiest_load_per_task) | 4939 | if (env->imbalance < busiest->load_per_task) |
| 4902 | return fix_small_imbalance(env, sds); | 4940 | return fix_small_imbalance(env, sds); |
| 4903 | |||
| 4904 | } | 4941 | } |
| 4905 | 4942 | ||
| 4906 | /******* find_busiest_group() helpers end here *********************/ | 4943 | /******* find_busiest_group() helpers end here *********************/ |
| @@ -4916,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4916 | * to restore balance. | 4953 | * to restore balance. |
| 4917 | * | 4954 | * |
| 4918 | * @env: The load balancing environment. | 4955 | * @env: The load balancing environment. |
| 4919 | * @balance: Pointer to a variable indicating if this_cpu | ||
| 4920 | * is the appropriate cpu to perform load balancing at this_level. | ||
| 4921 | * | 4956 | * |
| 4922 | * Return: - The busiest group if imbalance exists. | 4957 | * Return: - The busiest group if imbalance exists. |
| 4923 | * - If no imbalance and user has opted for power-savings balance, | 4958 | * - If no imbalance and user has opted for power-savings balance, |
| 4924 | * return the least loaded group whose CPUs can be | 4959 | * return the least loaded group whose CPUs can be |
| 4925 | * put to idle by rebalancing its tasks onto our group. | 4960 | * put to idle by rebalancing its tasks onto our group. |
| 4926 | */ | 4961 | */ |
| 4927 | static struct sched_group * | 4962 | static struct sched_group *find_busiest_group(struct lb_env *env) |
| 4928 | find_busiest_group(struct lb_env *env, int *balance) | ||
| 4929 | { | 4963 | { |
| 4964 | struct sg_lb_stats *local, *busiest; | ||
| 4930 | struct sd_lb_stats sds; | 4965 | struct sd_lb_stats sds; |
| 4931 | 4966 | ||
| 4932 | memset(&sds, 0, sizeof(sds)); | 4967 | init_sd_lb_stats(&sds); |
| 4933 | 4968 | ||
| 4934 | /* | 4969 | /* |
| 4935 | * Compute the various statistics relavent for load balancing at | 4970 | * Compute the various statistics relavent for load balancing at |
| 4936 | * this level. | 4971 | * this level. |
| 4937 | */ | 4972 | */ |
| 4938 | update_sd_lb_stats(env, balance, &sds); | 4973 | update_sd_lb_stats(env, &sds); |
| 4939 | 4974 | local = &sds.local_stat; | |
| 4940 | /* | 4975 | busiest = &sds.busiest_stat; |
| 4941 | * this_cpu is not the appropriate cpu to perform load balancing at | ||
| 4942 | * this level. | ||
| 4943 | */ | ||
| 4944 | if (!(*balance)) | ||
| 4945 | goto ret; | ||
| 4946 | 4976 | ||
| 4947 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 4977 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
| 4948 | check_asym_packing(env, &sds)) | 4978 | check_asym_packing(env, &sds)) |
| 4949 | return sds.busiest; | 4979 | return sds.busiest; |
| 4950 | 4980 | ||
| 4951 | /* There is no busy sibling group to pull tasks from */ | 4981 | /* There is no busy sibling group to pull tasks from */ |
| 4952 | if (!sds.busiest || sds.busiest_nr_running == 0) | 4982 | if (!sds.busiest || busiest->sum_nr_running == 0) |
| 4953 | goto out_balanced; | 4983 | goto out_balanced; |
| 4954 | 4984 | ||
| 4955 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 4985 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
| 4956 | 4986 | ||
| 4957 | /* | 4987 | /* |
| 4958 | * If the busiest group is imbalanced the below checks don't | 4988 | * If the busiest group is imbalanced the below checks don't |
| 4959 | * work because they assumes all things are equal, which typically | 4989 | * work because they assume all things are equal, which typically |
| 4960 | * isn't true due to cpus_allowed constraints and the like. | 4990 | * isn't true due to cpus_allowed constraints and the like. |
| 4961 | */ | 4991 | */ |
| 4962 | if (sds.group_imb) | 4992 | if (busiest->group_imb) |
| 4963 | goto force_balance; | 4993 | goto force_balance; |
| 4964 | 4994 | ||
| 4965 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4995 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| 4966 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4996 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && |
| 4967 | !sds.busiest_has_capacity) | 4997 | !busiest->group_has_capacity) |
| 4968 | goto force_balance; | 4998 | goto force_balance; |
| 4969 | 4999 | ||
| 4970 | /* | 5000 | /* |
| 4971 | * If the local group is more busy than the selected busiest group | 5001 | * If the local group is more busy than the selected busiest group |
| 4972 | * don't try and pull any tasks. | 5002 | * don't try and pull any tasks. |
| 4973 | */ | 5003 | */ |
| 4974 | if (sds.this_load >= sds.max_load) | 5004 | if (local->avg_load >= busiest->avg_load) |
| 4975 | goto out_balanced; | 5005 | goto out_balanced; |
| 4976 | 5006 | ||
| 4977 | /* | 5007 | /* |
| 4978 | * Don't pull any tasks if this group is already above the domain | 5008 | * Don't pull any tasks if this group is already above the domain |
| 4979 | * average load. | 5009 | * average load. |
| 4980 | */ | 5010 | */ |
| 4981 | if (sds.this_load >= sds.avg_load) | 5011 | if (local->avg_load >= sds.avg_load) |
| 4982 | goto out_balanced; | 5012 | goto out_balanced; |
| 4983 | 5013 | ||
| 4984 | if (env->idle == CPU_IDLE) { | 5014 | if (env->idle == CPU_IDLE) { |
| @@ -4988,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance) | |||
| 4988 | * there is no imbalance between this and busiest group | 5018 | * there is no imbalance between this and busiest group |
| 4989 | * wrt to idle cpu's, it is balanced. | 5019 | * wrt to idle cpu's, it is balanced. |
| 4990 | */ | 5020 | */ |
| 4991 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 5021 | if ((local->idle_cpus < busiest->idle_cpus) && |
| 4992 | sds.busiest_nr_running <= sds.busiest_group_weight) | 5022 | busiest->sum_nr_running <= busiest->group_weight) |
| 4993 | goto out_balanced; | 5023 | goto out_balanced; |
| 4994 | } else { | 5024 | } else { |
| 4995 | /* | 5025 | /* |
| 4996 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 5026 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
| 4997 | * imbalance_pct to be conservative. | 5027 | * imbalance_pct to be conservative. |
| 4998 | */ | 5028 | */ |
| 4999 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 5029 | if (100 * busiest->avg_load <= |
| 5030 | env->sd->imbalance_pct * local->avg_load) | ||
| 5000 | goto out_balanced; | 5031 | goto out_balanced; |
| 5001 | } | 5032 | } |
| 5002 | 5033 | ||
| @@ -5006,7 +5037,6 @@ force_balance: | |||
| 5006 | return sds.busiest; | 5037 | return sds.busiest; |
| 5007 | 5038 | ||
| 5008 | out_balanced: | 5039 | out_balanced: |
| 5009 | ret: | ||
| 5010 | env->imbalance = 0; | 5040 | env->imbalance = 0; |
| 5011 | return NULL; | 5041 | return NULL; |
| 5012 | } | 5042 | } |
| @@ -5018,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 5018 | struct sched_group *group) | 5048 | struct sched_group *group) |
| 5019 | { | 5049 | { |
| 5020 | struct rq *busiest = NULL, *rq; | 5050 | struct rq *busiest = NULL, *rq; |
| 5021 | unsigned long max_load = 0; | 5051 | unsigned long busiest_load = 0, busiest_power = 1; |
| 5022 | int i; | 5052 | int i; |
| 5023 | 5053 | ||
| 5024 | for_each_cpu(i, sched_group_cpus(group)) { | 5054 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 5025 | unsigned long power = power_of(i); | 5055 | unsigned long power = power_of(i); |
| 5026 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5056 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
| 5027 | SCHED_POWER_SCALE); | 5057 | SCHED_POWER_SCALE); |
| @@ -5030,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 5030 | if (!capacity) | 5060 | if (!capacity) |
| 5031 | capacity = fix_small_capacity(env->sd, group); | 5061 | capacity = fix_small_capacity(env->sd, group); |
| 5032 | 5062 | ||
| 5033 | if (!cpumask_test_cpu(i, env->cpus)) | ||
| 5034 | continue; | ||
| 5035 | |||
| 5036 | rq = cpu_rq(i); | 5063 | rq = cpu_rq(i); |
| 5037 | wl = weighted_cpuload(i); | 5064 | wl = weighted_cpuload(i); |
| 5038 | 5065 | ||
| @@ -5048,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 5048 | * the weighted_cpuload() scaled with the cpu power, so that | 5075 | * the weighted_cpuload() scaled with the cpu power, so that |
| 5049 | * the load can be moved away from the cpu that is potentially | 5076 | * the load can be moved away from the cpu that is potentially |
| 5050 | * running at a lower capacity. | 5077 | * running at a lower capacity. |
| 5078 | * | ||
| 5079 | * Thus we're looking for max(wl_i / power_i), crosswise | ||
| 5080 | * multiplication to rid ourselves of the division works out | ||
| 5081 | * to: wl_i * power_j > wl_j * power_i; where j is our | ||
| 5082 | * previous maximum. | ||
| 5051 | */ | 5083 | */ |
| 5052 | wl = (wl * SCHED_POWER_SCALE) / power; | 5084 | if (wl * busiest_power > busiest_load * power) { |
| 5053 | 5085 | busiest_load = wl; | |
| 5054 | if (wl > max_load) { | 5086 | busiest_power = power; |
| 5055 | max_load = wl; | ||
| 5056 | busiest = rq; | 5087 | busiest = rq; |
| 5057 | } | 5088 | } |
| 5058 | } | 5089 | } |
| @@ -5089,13 +5120,47 @@ static int need_active_balance(struct lb_env *env) | |||
| 5089 | 5120 | ||
| 5090 | static int active_load_balance_cpu_stop(void *data); | 5121 | static int active_load_balance_cpu_stop(void *data); |
| 5091 | 5122 | ||
| 5123 | static int should_we_balance(struct lb_env *env) | ||
| 5124 | { | ||
| 5125 | struct sched_group *sg = env->sd->groups; | ||
| 5126 | struct cpumask *sg_cpus, *sg_mask; | ||
| 5127 | int cpu, balance_cpu = -1; | ||
| 5128 | |||
| 5129 | /* | ||
| 5130 | * In the newly idle case, we will allow all the cpu's | ||
| 5131 | * to do the newly idle load balance. | ||
| 5132 | */ | ||
| 5133 | if (env->idle == CPU_NEWLY_IDLE) | ||
| 5134 | return 1; | ||
| 5135 | |||
| 5136 | sg_cpus = sched_group_cpus(sg); | ||
| 5137 | sg_mask = sched_group_mask(sg); | ||
| 5138 | /* Try to find first idle cpu */ | ||
| 5139 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | ||
| 5140 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | ||
| 5141 | continue; | ||
| 5142 | |||
| 5143 | balance_cpu = cpu; | ||
| 5144 | break; | ||
| 5145 | } | ||
| 5146 | |||
| 5147 | if (balance_cpu == -1) | ||
| 5148 | balance_cpu = group_balance_cpu(sg); | ||
| 5149 | |||
| 5150 | /* | ||
| 5151 | * First idle cpu or the first cpu(busiest) in this sched group | ||
| 5152 | * is eligible for doing load balancing at this and above domains. | ||
| 5153 | */ | ||
| 5154 | return balance_cpu != env->dst_cpu; | ||
| 5155 | } | ||
| 5156 | |||
| 5092 | /* | 5157 | /* |
| 5093 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 5158 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 5094 | * tasks if there is an imbalance. | 5159 | * tasks if there is an imbalance. |
| 5095 | */ | 5160 | */ |
| 5096 | static int load_balance(int this_cpu, struct rq *this_rq, | 5161 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 5097 | struct sched_domain *sd, enum cpu_idle_type idle, | 5162 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 5098 | int *balance) | 5163 | int *continue_balancing) |
| 5099 | { | 5164 | { |
| 5100 | int ld_moved, cur_ld_moved, active_balance = 0; | 5165 | int ld_moved, cur_ld_moved, active_balance = 0; |
| 5101 | struct sched_group *group; | 5166 | struct sched_group *group; |
| @@ -5125,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5125 | schedstat_inc(sd, lb_count[idle]); | 5190 | schedstat_inc(sd, lb_count[idle]); |
| 5126 | 5191 | ||
| 5127 | redo: | 5192 | redo: |
| 5128 | group = find_busiest_group(&env, balance); | 5193 | if (!should_we_balance(&env)) { |
| 5129 | 5194 | *continue_balancing = 0; | |
| 5130 | if (*balance == 0) | ||
| 5131 | goto out_balanced; | 5195 | goto out_balanced; |
| 5196 | } | ||
| 5132 | 5197 | ||
| 5198 | group = find_busiest_group(&env); | ||
| 5133 | if (!group) { | 5199 | if (!group) { |
| 5134 | schedstat_inc(sd, lb_nobusyg[idle]); | 5200 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 5135 | goto out_balanced; | 5201 | goto out_balanced; |
| @@ -5341,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5341 | rcu_read_lock(); | 5407 | rcu_read_lock(); |
| 5342 | for_each_domain(this_cpu, sd) { | 5408 | for_each_domain(this_cpu, sd) { |
| 5343 | unsigned long interval; | 5409 | unsigned long interval; |
| 5344 | int balance = 1; | 5410 | int continue_balancing = 1; |
| 5345 | 5411 | ||
| 5346 | if (!(sd->flags & SD_LOAD_BALANCE)) | 5412 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5347 | continue; | 5413 | continue; |
| @@ -5349,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5349 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 5415 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 5350 | /* If we've pulled tasks over stop searching: */ | 5416 | /* If we've pulled tasks over stop searching: */ |
| 5351 | pulled_task = load_balance(this_cpu, this_rq, | 5417 | pulled_task = load_balance(this_cpu, this_rq, |
| 5352 | sd, CPU_NEWLY_IDLE, &balance); | 5418 | sd, CPU_NEWLY_IDLE, |
| 5419 | &continue_balancing); | ||
| 5353 | } | 5420 | } |
| 5354 | 5421 | ||
| 5355 | interval = msecs_to_jiffies(sd->balance_interval); | 5422 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -5587,7 +5654,7 @@ void update_max_interval(void) | |||
| 5587 | */ | 5654 | */ |
| 5588 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
| 5589 | { | 5656 | { |
| 5590 | int balance = 1; | 5657 | int continue_balancing = 1; |
| 5591 | struct rq *rq = cpu_rq(cpu); | 5658 | struct rq *rq = cpu_rq(cpu); |
| 5592 | unsigned long interval; | 5659 | unsigned long interval; |
| 5593 | struct sched_domain *sd; | 5660 | struct sched_domain *sd; |
| @@ -5619,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 5619 | } | 5686 | } |
| 5620 | 5687 | ||
| 5621 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5688 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
| 5622 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5689 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
| 5623 | /* | 5690 | /* |
| 5624 | * The LBF_SOME_PINNED logic could have changed | 5691 | * The LBF_SOME_PINNED logic could have changed |
| 5625 | * env->dst_cpu, so we can't know our idle | 5692 | * env->dst_cpu, so we can't know our idle |
| @@ -5642,7 +5709,7 @@ out: | |||
| 5642 | * CPU in our sched group which is doing load balancing more | 5709 | * CPU in our sched group which is doing load balancing more |
| 5643 | * actively. | 5710 | * actively. |
| 5644 | */ | 5711 | */ |
| 5645 | if (!balance) | 5712 | if (!continue_balancing) |
| 5646 | break; | 5713 | break; |
| 5647 | } | 5714 | } |
| 5648 | rcu_read_unlock(); | 5715 | rcu_read_unlock(); |
| @@ -5938,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 5938 | * and ensure we don't carry in an old decay_count if we | 6005 | * and ensure we don't carry in an old decay_count if we |
| 5939 | * switch back. | 6006 | * switch back. |
| 5940 | */ | 6007 | */ |
| 5941 | if (p->se.avg.decay_count) { | 6008 | if (se->avg.decay_count) { |
| 5942 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 6009 | __synchronize_entity_decay(se); |
| 5943 | __synchronize_entity_decay(&p->se); | 6010 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); |
| 5944 | subtract_blocked_load_contrib(cfs_rq, | ||
| 5945 | p->se.avg.load_avg_contrib); | ||
| 5946 | } | 6011 | } |
| 5947 | #endif | 6012 | #endif |
| 5948 | } | 6013 | } |
diff --git a/kernel/smp.c b/kernel/smp.c index b1c9034bdfcb..449b707fc20d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void) | |||
| 186 | 186 | ||
| 187 | while (!list_empty(&list)) { | 187 | while (!list_empty(&list)) { |
| 188 | struct call_single_data *csd; | 188 | struct call_single_data *csd; |
| 189 | unsigned int csd_flags; | ||
| 190 | 189 | ||
| 191 | csd = list_entry(list.next, struct call_single_data, list); | 190 | csd = list_entry(list.next, struct call_single_data, list); |
| 192 | list_del(&csd->list); | 191 | list_del(&csd->list); |
| 193 | 192 | ||
| 194 | /* | ||
| 195 | * 'csd' can be invalid after this call if flags == 0 | ||
| 196 | * (when called through generic_exec_single()), | ||
| 197 | * so save them away before making the call: | ||
| 198 | */ | ||
| 199 | csd_flags = csd->flags; | ||
| 200 | |||
| 201 | csd->func(csd->info); | 193 | csd->func(csd->info); |
| 202 | 194 | ||
| 203 | /* | 195 | csd_unlock(csd); |
| 204 | * Unlocked CSDs are valid through generic_exec_single(): | ||
| 205 | */ | ||
| 206 | if (csd_flags & CSD_FLAG_LOCK) | ||
| 207 | csd_unlock(csd); | ||
| 208 | } | 196 | } |
| 209 | } | 197 | } |
| 210 | 198 | ||
