author     Peter Zijlstra <peterz@infradead.org>    2013-08-15 14:29:29 -0400
committer  Ingo Molnar <mingo@kernel.org>           2013-09-02 02:27:38 -0400
commit     30ce5dabc92b5a349a7d9e9cf499494d230e0691 (patch)
tree       89c316645f6ef3a60c821e0d7a5d3e01379cbdf7 /kernel
parent     6906a40839198f33dbb56d20e644c01e00663952 (diff)
sched/fair: Rework and comment the group_imb code
Rik reported some weirdness due to the group_imb code. As a start to
looking at it, clean it up a little and add a few explanatory
comments.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c  123
1 file changed, 89 insertions(+), 34 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bedd30b168a5..dffb27070ddb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
         return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *     { 0 1 2 3 } { 4 5 6 7 }
+ *             *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected, this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+        unsigned long max_nr_running, min_nr_running;
+        unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+        sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+        sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+                    unsigned long load, unsigned long nr_running)
+{
+        if (load > sgi->max_cpu_load)
+                sgi->max_cpu_load = load;
+        if (sgi->min_cpu_load > load)
+                sgi->min_cpu_load = load;
+
+        if (nr_running > sgi->max_nr_running)
+                sgi->max_nr_running = nr_running;
+        if (sgi->min_nr_running > nr_running)
+                sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+        /*
+         * Consider the group unbalanced when the imbalance is larger
+         * than the average weight of a task.
+         *
+         * APZ: with cgroup the avg task weight can vary wildly and
+         * might not be a suitable number - should we keep a
+         * normalized nr_running number somewhere that negates
+         * the hierarchy?
+         */
+        if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+            (sgi->max_nr_running - sgi->min_nr_running) > 1)
+                return 1;
+
+        return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
                         int local_group, struct sg_lb_stats *sgs)
 {
-        unsigned long nr_running, max_nr_running, min_nr_running;
-        unsigned long load, max_cpu_load, min_cpu_load;
+        struct sg_imb_stats sgi;
+        unsigned long nr_running;
+        unsigned long load;
         int i;
 
-        /* Tally up the load of all CPUs in the group */
-        max_cpu_load = 0;
-        min_cpu_load = ~0UL;
-        max_nr_running = 0;
-        min_nr_running = ~0UL;
+        init_sg_imb_stats(&sgi);
 
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                 struct rq *rq = cpu_rq(i);
@@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         load = target_load(i, load_idx);
                 } else {
                         load = source_load(i, load_idx);
-
-                        if (load > max_cpu_load)
-                                max_cpu_load = load;
-                        if (min_cpu_load > load)
-                                min_cpu_load = load;
-
-                        if (nr_running > max_nr_running)
-                                max_nr_running = nr_running;
-                        if (min_nr_running > nr_running)
-                                min_nr_running = nr_running;
+                        update_sg_imb_stats(&sgi, load, nr_running);
                 }
 
                 sgs->group_load += load;
@@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         sgs->group_power = group->sgp->power;
         sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-        /*
-         * Consider the group unbalanced when the imbalance is larger
-         * than the average weight of a task.
-         *
-         * APZ: with cgroup the avg task weight can vary wildly and
-         * might not be a suitable number - should we keep a
-         * normalized nr_running number somewhere that negates
-         * the hierarchy?
-         */
         if (sgs->sum_nr_running)
                 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-        if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task &&
-            (max_nr_running - min_nr_running) > 1)
-                sgs->group_imb = 1;
+        sgs->group_imb = sg_imbalanced(sgs, &sgi);
 
         sgs->group_capacity =
                 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
@@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         busiest = &sds->busiest_stat;
 
         if (busiest->group_imb) {
+                /*
+                 * In the group_imb case we cannot rely on group-wide averages
+                 * to ensure cpu-load equilibrium, look at wider averages. XXX
+                 */
                 busiest->load_per_task =
                         min(busiest->load_per_task, sds->avg_load);
         }
@@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         if (!busiest->group_imb) {
                 /*
                  * Don't want to pull so many tasks that a group would go idle.
+                 * Except of course for the group_imb case, since then we might
+                 * have to drop below capacity to reach cpu-load equilibrium.
                  */
                 load_above_capacity =
                         (busiest->sum_nr_running - busiest->group_capacity);
@@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
          * we also don't want to reduce the group load below the group capacity
          * (so that we can implement power-savings policies etc). Thus we look
          * for the minimum possible imbalance.
-         * Be careful of negative numbers as they'll appear as very large values
-         * with unsigned longs.
          */
-        max_pull = min(busiest->avg_load - sds->avg_load,
-                       load_above_capacity);
+        max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
         /* How much load to actually move to equalise the imbalance */
         env->imbalance = min(
@@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
         /*
          * If the busiest group is imbalanced the below checks don't
-         * work because they assumes all things are equal, which typically
+         * work because they assume all things are equal, which typically
          * isn't true due to cpus_allowed constraints and the like.
          */
         if (busiest->group_imb)
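
Illustration (not part of the commit): the stand-alone, user-space C sketch below mirrors the skew check that the new update_sg_imb_stats()/sg_imbalanced() helpers perform. The per-cpu load and nr_running arrays and the load_per_task value are invented inputs chosen to reproduce the { 0 1 2 3 } scenario from the comment block above; the kernel-side types (struct sg_lb_stats, struct rq, the cpumask iteration) are deliberately left out.

#include <stdio.h>

/* Mirrors the min/max tracking done by update_sg_imb_stats() in the patch. */
struct sg_imb_stats {
        unsigned long max_nr_running, min_nr_running;
        unsigned long max_cpu_load, min_cpu_load;
};

static void init_sg_imb_stats(struct sg_imb_stats *sgi)
{
        sgi->max_cpu_load = sgi->max_nr_running = 0UL;
        sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
}

static void update_sg_imb_stats(struct sg_imb_stats *sgi,
                                unsigned long load, unsigned long nr_running)
{
        if (load > sgi->max_cpu_load)
                sgi->max_cpu_load = load;
        if (sgi->min_cpu_load > load)
                sgi->min_cpu_load = load;

        if (nr_running > sgi->max_nr_running)
                sgi->max_nr_running = nr_running;
        if (sgi->min_nr_running > nr_running)
                sgi->min_nr_running = nr_running;
}

/* Same condition as sg_imbalanced(), taking load_per_task as a plain number. */
static int sg_imbalanced(unsigned long load_per_task, struct sg_imb_stats *sgi)
{
        return (sgi->max_cpu_load - sgi->min_cpu_load) >= load_per_task &&
               (sgi->max_nr_running - sgi->min_nr_running) > 1;
}

int main(void)
{
        /* Hypothetical first group { 0 1 2 3 }: cpu 3 runs all four tasks. */
        unsigned long load[4]       = { 0, 0, 0, 4096 };
        unsigned long nr_running[4] = { 0, 0, 0, 4 };
        unsigned long load_per_task = 1024;   /* made-up average task weight */
        struct sg_imb_stats sgi;
        int i;

        init_sg_imb_stats(&sgi);
        for (i = 0; i < 4; i++)
                update_sg_imb_stats(&sgi, load[i], nr_running[i]);

        printf("group_imb = %d\n", sg_imbalanced(load_per_task, &sgi));
        return 0;
}

With these numbers the load spread (4096 - 0) exceeds load_per_task and the nr_running spread (4 - 0) is greater than 1, so the check fires and the group would be flagged group_imb, making it a candidate for busiest in update_sd_pick_busiest().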