path: root/kernel
author     Peter Zijlstra <peterz@infradead.org>    2013-08-15 14:29:29 -0400
committer  Ingo Molnar <mingo@kernel.org>           2013-09-02 02:27:38 -0400
commit     30ce5dabc92b5a349a7d9e9cf499494d230e0691 (patch)
tree       89c316645f6ef3a60c821e0d7a5d3e01379cbdf7 /kernel
parent     6906a40839198f33dbb56d20e644c01e00663952 (diff)
sched/fair: Rework and comment the group_imb code
Rik reported some weirdness due to the group_imb code. As a start to
looking at it, clean it up a little and add a few explanatory comments.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/fair.c	123
1 file changed, 89 insertions(+), 34 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bedd30b168a5..dffb27070ddb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected, this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+	unsigned long max_nr_running, min_nr_running;
+	unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+		    unsigned long load, unsigned long nr_running)
+{
+	if (load > sgi->max_cpu_load)
+		sgi->max_cpu_load = load;
+	if (sgi->min_cpu_load > load)
+		sgi->min_cpu_load = load;
+
+	if (nr_running > sgi->max_nr_running)
+		sgi->max_nr_running = nr_running;
+	if (sgi->min_nr_running > nr_running)
+		sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of a task.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 * might not be a suitable number - should we keep a
+	 * normalized nr_running number somewhere that negates
+	 * the hierarchy?
+	 */
+	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
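A brief aside on the heuristic added above: sg_imbalanced() fires only when the per-cpu load spread within the group is at least one average task weight and the nr_running spread is larger than one. The following user-space sketch is not kernel code; the sg_lb_stats stub, the per-cpu load values and the load_per_task figure are invented to mirror the { 0 1 2 3 } group from the comment, with all four pinned tasks sitting on cpu 3:

#include <stdio.h>

/* Stand-ins for the kernel structures; only the fields the heuristic reads. */
struct sg_lb_stats { unsigned long load_per_task; };
struct sg_imb_stats {
	unsigned long max_nr_running, min_nr_running;
	unsigned long max_cpu_load, min_cpu_load;
};

static void init_sg_imb_stats(struct sg_imb_stats *sgi)
{
	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
}

static void update_sg_imb_stats(struct sg_imb_stats *sgi,
				unsigned long load, unsigned long nr_running)
{
	if (load > sgi->max_cpu_load)
		sgi->max_cpu_load = load;
	if (sgi->min_cpu_load > load)
		sgi->min_cpu_load = load;
	if (nr_running > sgi->max_nr_running)
		sgi->max_nr_running = nr_running;
	if (sgi->min_nr_running > nr_running)
		sgi->min_nr_running = nr_running;
}

static int sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
{
	return (sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
	       (sgi->max_nr_running - sgi->min_nr_running) > 1;
}

int main(void)
{
	/* First group { 0 1 2 3 }: the four pinned tasks all sit on cpu 3. */
	unsigned long load[4]       = { 0, 0, 0, 4096 };	/* invented weighted loads */
	unsigned long nr_running[4] = { 0, 0, 0, 4 };
	struct sg_lb_stats sgs = { .load_per_task = 1024 };	/* roughly one task's weight */
	struct sg_imb_stats sgi;
	int i;

	init_sg_imb_stats(&sgi);
	for (i = 0; i < 4; i++)
		update_sg_imb_stats(&sgi, load[i], nr_running[i]);

	/* load spread 4096 >= 1024 and nr_running spread 4 > 1 => imbalanced */
	printf("group_imb = %d\n", sg_imbalanced(&sgs, &sgi));
	return 0;
}

It prints group_imb = 1, which is the flag update_sd_pick_busiest(), calculate_imbalance() and find_busiest_group() then use to relax the usual balance conditions.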
@@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 					struct sched_group *group, int load_idx,
 					int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running, max_nr_running, min_nr_running;
-	unsigned long load, max_cpu_load, min_cpu_load;
+	struct sg_imb_stats sgi;
+	unsigned long nr_running;
+	unsigned long load;
 	int i;
 
-	/* Tally up the load of all CPUs in the group */
-	max_cpu_load = 0;
-	min_cpu_load = ~0UL;
-	max_nr_running = 0;
-	min_nr_running = ~0UL;
+	init_sg_imb_stats(&sgi);
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-
-			if (load > max_cpu_load)
-				max_cpu_load = load;
-			if (min_cpu_load > load)
-				min_cpu_load = load;
-
-			if (nr_running > max_nr_running)
-				max_nr_running = nr_running;
-			if (min_nr_running > nr_running)
-				min_nr_running = nr_running;
+			update_sg_imb_stats(&sgi, load, nr_running);
 		}
 
 		sgs->group_load += load;
@@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	sgs->group_power = group->sgp->power;
 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-	/*
-	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of a task.
-	 *
-	 * APZ: with cgroup the avg task weight can vary wildly and
-	 * might not be a suitable number - should we keep a
-	 * normalized nr_running number somewhere that negates
-	 * the hierarchy?
-	 */
 	if (sgs->sum_nr_running)
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task &&
-	    (max_nr_running - min_nr_running) > 1)
-		sgs->group_imb = 1;
+	sgs->group_imb = sg_imbalanced(sgs, &sgi);
 
 	sgs->group_capacity =
 		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
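For reference, the two ratios kept above are plain integer divisions: avg_load normalizes the summed group load by the group's compute power so that groups of different sizes compare on the same scale, and load_per_task is the weighted load divided by the number of running tasks. A rough worked example, assuming SCHED_POWER_SCALE is 1024 (its value in this kernel series) and using invented loads:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL	/* assumed: 1 << SCHED_POWER_SHIFT on this kernel series */

int main(void)
{
	/* Hypothetical group: four full-power cpus, three runnable tasks. */
	unsigned long group_load        = 3072;	/* sum of per-cpu loads */
	unsigned long group_power       = 4 * SCHED_POWER_SCALE;
	unsigned long sum_weighted_load = 3072;
	unsigned long sum_nr_running    = 3;

	unsigned long avg_load      = group_load * SCHED_POWER_SCALE / group_power;
	unsigned long load_per_task = sum_weighted_load / sum_nr_running;

	/* avg_load = 3072*1024/4096 = 768, load_per_task = 3072/3 = 1024 */
	printf("avg_load=%lu load_per_task=%lu\n", avg_load, load_per_task);
	return 0;
}

With four full-power cpus the group power is 4096, so a group load of 3072 averages out to 768 per SCHED_POWER_SCALE unit.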
@@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	busiest = &sds->busiest_stat;
 
 	if (busiest->group_imb) {
+		/*
+		 * In the group_imb case we cannot rely on group-wide averages
+		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 */
 		busiest->load_per_task =
 			min(busiest->load_per_task, sds->avg_load);
 	}
@@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
+		 * Except of course for the group_imb case, since then we might
+		 * have to drop below capacity to reach cpu-load equilibrium.
 		 */
 		load_above_capacity =
 			(busiest->sum_nr_running - busiest->group_capacity);
@@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
-	 * Be careful of negative numbers as they'll appear as very large values
-	 * with unsigned longs.
 	 */
-	max_pull = min(busiest->avg_load - sds->avg_load,
-			load_above_capacity);
+	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
 	env->imbalance = min(
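The clamp above is easiest to see with numbers. Per the new comments, the group_imb case may legitimately pull a group below its capacity; otherwise max_pull is bounded both by how far the busiest group sits above the domain-wide average and by the load above its capacity. The sketch below only illustrates the min() on the max_pull line with invented, already power-scaled figures; it does not reproduce the rest of calculate_imbalance():

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Invented, already power-scaled figures. */
	unsigned long busiest_avg_load    = 1536;	/* busiest group's avg_load */
	unsigned long sds_avg_load        = 1024;	/* domain-wide average load */
	unsigned long load_above_capacity = 2048;	/* load beyond the group's capacity */

	/*
	 * Pull at most the busiest group's excess over the domain average,
	 * and at most what sits above the group's capacity.
	 */
	unsigned long max_pull = min_ul(busiest_avg_load - sds_avg_load,
					load_above_capacity);

	printf("max_pull = %lu\n", max_pull);	/* prints 512 */
	return 0;
}

Here the busiest group is only 512 above the domain average, so that bound, not the 2048 above capacity, limits the pull.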
@@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
-	 * work because they assumes all things are equal, which typically
+	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
 	if (busiest->group_imb)