diff options
| author | Peter Zijlstra <peterz@infradead.org> | 2013-08-15 14:29:29 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2013-09-02 02:27:38 -0400 |
| commit | 30ce5dabc92b5a349a7d9e9cf499494d230e0691 (patch) | |
| tree | 89c316645f6ef3a60c821e0d7a5d3e01379cbdf7 /kernel | |
| parent | 6906a40839198f33dbb56d20e644c01e00663952 (diff) | |
sched/fair: Rework and comment the group_imb code
Rik reported some weirdness due to the group_imb code. As a start to
looking at it, clean it up a little and add a few explanatory
comments.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/fair.c | 123 |
1 files changed, 89 insertions, 34 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bedd30b168a5..dffb27070ddb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4463 | return 0; | 4463 | return 0; |
| 4464 | } | 4464 | } |
| 4465 | 4465 | ||
| 4466 | /* | ||
| 4467 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
| 4468 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
| 4469 | * | ||
| 4470 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
| 4471 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
| 4472 | * Something like: | ||
| 4473 | * | ||
| 4474 | * { 0 1 2 3 } { 4 5 6 7 } | ||
| 4475 | * * * * * | ||
| 4476 | * | ||
| 4477 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
| 4478 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
| 4479 | * cpu 3 and leave one of the cpus in the second group unused. | ||
| 4480 | * | ||
| 4481 | * The current solution to this issue is detecting the skew in the first group | ||
| 4482 | * by noticing it has a cpu that is overloaded while the remaining cpus are | ||
| 4483 | * idle -- or rather, there's a distinct imbalance in the cpus; see | ||
| 4484 | * sg_imbalanced(). | ||
| 4485 | * | ||
| 4486 | * When this is so detected, this group becomes a candidate for busiest; see | ||
| 4487 | * update_sd_pick_busiest(). And calculate_imbalance() and | ||
| 4488 | * find_busiest_group() avoid some of the usual balance conditions to allow it | ||
| 4489 | * to create an effective group imbalance. | ||
| 4490 | * | ||
| 4491 | * This is a somewhat tricky proposition since the next run might not find the | ||
| 4492 | * group imbalance and decide the groups need to be balanced again. A most | ||
| 4493 | * subtle and fragile situation. | ||
| 4494 | */ | ||
| 4495 | |||
| 4496 | struct sg_imb_stats { | ||
| 4497 | unsigned long max_nr_running, min_nr_running; | ||
| 4498 | unsigned long max_cpu_load, min_cpu_load; | ||
| 4499 | }; | ||
| 4500 | |||
| 4501 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
| 4502 | { | ||
| 4503 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | ||
| 4504 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
| 4505 | } | ||
| 4506 | |||
| 4507 | static inline void | ||
| 4508 | update_sg_imb_stats(struct sg_imb_stats *sgi, | ||
| 4509 | unsigned long load, unsigned long nr_running) | ||
| 4510 | { | ||
| 4511 | if (load > sgi->max_cpu_load) | ||
| 4512 | sgi->max_cpu_load = load; | ||
| 4513 | if (sgi->min_cpu_load > load) | ||
| 4514 | sgi->min_cpu_load = load; | ||
| 4515 | |||
| 4516 | if (nr_running > sgi->max_nr_running) | ||
| 4517 | sgi->max_nr_running = nr_running; | ||
| 4518 | if (sgi->min_nr_running > nr_running) | ||
| 4519 | sgi->min_nr_running = nr_running; | ||
| 4520 | } | ||
| 4521 | |||
| 4522 | static inline int | ||
| 4523 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | ||
| 4524 | { | ||
| 4525 | /* | ||
| 4526 | * Consider the group unbalanced when the imbalance is larger | ||
| 4527 | * than the average weight of a task. | ||
| 4528 | * | ||
| 4529 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4530 | * might not be a suitable number - should we keep a | ||
| 4531 | * normalized nr_running number somewhere that negates | ||
| 4532 | * the hierarchy? | ||
| 4533 | */ | ||
| 4534 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
| 4535 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
| 4536 | return 1; | ||
| 4537 | |||
| 4538 | return 0; | ||
| 4539 | } | ||
| 4540 | |||
| 4466 | /** | 4541 | /** |
| 4467 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 4542 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 4468 | * @env: The load balancing environment. | 4543 | * @env: The load balancing environment. |
| @@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4475 | struct sched_group *group, int load_idx, | 4550 | struct sched_group *group, int load_idx, |
| 4476 | int local_group, struct sg_lb_stats *sgs) | 4551 | int local_group, struct sg_lb_stats *sgs) |
| 4477 | { | 4552 | { |
| 4478 | unsigned long nr_running, max_nr_running, min_nr_running; | 4553 | struct sg_imb_stats sgi; |
| 4479 | unsigned long load, max_cpu_load, min_cpu_load; | 4554 | unsigned long nr_running; |
| 4555 | unsigned long load; | ||
| 4480 | int i; | 4556 | int i; |
| 4481 | 4557 | ||
| 4482 | /* Tally up the load of all CPUs in the group */ | 4558 | init_sg_imb_stats(&sgi); |
| 4483 | max_cpu_load = 0; | ||
| 4484 | min_cpu_load = ~0UL; | ||
| 4485 | max_nr_running = 0; | ||
| 4486 | min_nr_running = ~0UL; | ||
| 4487 | 4559 | ||
| 4488 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 4560 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 4489 | struct rq *rq = cpu_rq(i); | 4561 | struct rq *rq = cpu_rq(i); |
| @@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4495 | load = target_load(i, load_idx); | 4567 | load = target_load(i, load_idx); |
| 4496 | } else { | 4568 | } else { |
| 4497 | load = source_load(i, load_idx); | 4569 | load = source_load(i, load_idx); |
| 4498 | 4570 | update_sg_imb_stats(&sgi, load, nr_running); | |
| 4499 | if (load > max_cpu_load) | ||
| 4500 | max_cpu_load = load; | ||
| 4501 | if (min_cpu_load > load) | ||
| 4502 | min_cpu_load = load; | ||
| 4503 | |||
| 4504 | if (nr_running > max_nr_running) | ||
| 4505 | max_nr_running = nr_running; | ||
| 4506 | if (min_nr_running > nr_running) | ||
| 4507 | min_nr_running = nr_running; | ||
| 4508 | } | 4571 | } |
| 4509 | 4572 | ||
| 4510 | sgs->group_load += load; | 4573 | sgs->group_load += load; |
| @@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4522 | sgs->group_power = group->sgp->power; | 4585 | sgs->group_power = group->sgp->power; |
| 4523 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 4586 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; |
| 4524 | 4587 | ||
| 4525 | /* | ||
| 4526 | * Consider the group unbalanced when the imbalance is larger | ||
| 4527 | * than the average weight of a task. | ||
| 4528 | * | ||
| 4529 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4530 | * might not be a suitable number - should we keep a | ||
| 4531 | * normalized nr_running number somewhere that negates | ||
| 4532 | * the hierarchy? | ||
| 4533 | */ | ||
| 4534 | if (sgs->sum_nr_running) | 4588 | if (sgs->sum_nr_running) |
| 4535 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 4589 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 4536 | 4590 | ||
| 4537 | if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task && | 4591 | sgs->group_imb = sg_imbalanced(sgs, &sgi); |
| 4538 | (max_nr_running - min_nr_running) > 1) | ||
| 4539 | sgs->group_imb = 1; | ||
| 4540 | 4592 | ||
| 4541 | sgs->group_capacity = | 4593 | sgs->group_capacity = |
| 4542 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | 4594 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); |
| @@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4781 | busiest = &sds->busiest_stat; | 4833 | busiest = &sds->busiest_stat; |
| 4782 | 4834 | ||
| 4783 | if (busiest->group_imb) { | 4835 | if (busiest->group_imb) { |
| 4836 | /* | ||
| 4837 | * In the group_imb case we cannot rely on group-wide averages | ||
| 4838 | * to ensure cpu-load equilibrium, look at wider averages. XXX | ||
| 4839 | */ | ||
| 4784 | busiest->load_per_task = | 4840 | busiest->load_per_task = |
| 4785 | min(busiest->load_per_task, sds->avg_load); | 4841 | min(busiest->load_per_task, sds->avg_load); |
| 4786 | } | 4842 | } |
| @@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4798 | if (!busiest->group_imb) { | 4854 | if (!busiest->group_imb) { |
| 4799 | /* | 4855 | /* |
| 4800 | * Don't want to pull so many tasks that a group would go idle. | 4856 | * Don't want to pull so many tasks that a group would go idle. |
| 4857 | * Except of course for the group_imb case, since then we might | ||
| 4858 | * have to drop below capacity to reach cpu-load equilibrium. | ||
| 4801 | */ | 4859 | */ |
| 4802 | load_above_capacity = | 4860 | load_above_capacity = |
| 4803 | (busiest->sum_nr_running - busiest->group_capacity); | 4861 | (busiest->sum_nr_running - busiest->group_capacity); |
| @@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4813 | * we also don't want to reduce the group load below the group capacity | 4871 | * we also don't want to reduce the group load below the group capacity |
| 4814 | * (so that we can implement power-savings policies etc). Thus we look | 4872 | * (so that we can implement power-savings policies etc). Thus we look |
| 4815 | * for the minimum possible imbalance. | 4873 | * for the minimum possible imbalance. |
| 4816 | * Be careful of negative numbers as they'll appear as very large values | ||
| 4817 | * with unsigned longs. | ||
| 4818 | */ | 4874 | */ |
| 4819 | max_pull = min(busiest->avg_load - sds->avg_load, | 4875 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
| 4820 | load_above_capacity); | ||
| 4821 | 4876 | ||
| 4822 | /* How much load to actually move to equalise the imbalance */ | 4877 | /* How much load to actually move to equalise the imbalance */ |
| 4823 | env->imbalance = min( | 4878 | env->imbalance = min( |
| @@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 4881 | 4936 | ||
| 4882 | /* | 4937 | /* |
| 4883 | * If the busiest group is imbalanced the below checks don't | 4938 | * If the busiest group is imbalanced the below checks don't |
| 4884 | * work because they assumes all things are equal, which typically | 4939 | * work because they assume all things are equal, which typically |
| 4885 | * isn't true due to cpus_allowed constraints and the like. | 4940 | * isn't true due to cpus_allowed constraints and the like. |
| 4886 | */ | 4941 | */ |
| 4887 | if (busiest->group_imb) | 4942 | if (busiest->group_imb) |
