author	Vincent Guittot <vincent.guittot@linaro.org>	2016-12-08 11:56:54 -0500
committer	Ingo Molnar <mingo@kernel.org>	2016-12-11 07:10:57 -0500
commit	6b94780e45c17b83e3e75f8aaca5a328db583c74 (patch)
tree	b372b075bbc4ab12ecf4443670667c295b90c80b /kernel/sched/fair.c
parent	f519a3f1c6b7a990e5aed37a8f853c6ecfdee945 (diff)
sched/core: Use load_avg for selecting idlest group
find_idlest_group() only compares the runnable_load_avg when looking for the least loaded group. But on fork-intensive use cases like hackbench, where tasks block quickly after the fork, this can lead to selecting the same CPU instead of other CPUs which have a similar runnable load but a lower load_avg.

When the runnable_load_avg values of 2 CPUs are close, we now take into account the amount of blocked load as a 2nd selection factor. There are now 3 zones for the runnable_load of the rq:

- [0 .. (runnable_load - imbalance)]:
  Select the new rq which has significantly less runnable_load

- [(runnable_load - imbalance) .. (runnable_load + imbalance)]:
  The runnable loads are close so we use load_avg to choose between the 2 rq

- [(runnable_load + imbalance) .. ULONG_MAX]:
  Keep the current rq which has significantly less runnable_load

The scale factor that is currently used for comparing runnable_load doesn't work well with small values. For example, a scaling factor fails as soon as this_runnable_load == 0, because we then always select the local rq even if min_runnable_load is only 1, which doesn't really make sense because the two loads are essentially the same. So instead of a scaling factor, we use an absolute margin for runnable_load to detect CPUs with similar runnable_load, and we keep using the scaling factor for blocked load.

For use cases like hackbench, this enables the scheduler to select different CPUs during the fork sequence and to spread tasks across the system.

Tests have been done on a Hikey board (ARM based octo cores) for several kernels. The results below give min, max, avg and stdev values of 18 runs with each configuration.

The patches depend on the "no missing update_rq_clock()" work.

hackbench -P -g 1

         ea86cb4b7621  7dc603c9028e  v4.8       v4.8+patches
  min    0.049         0.050         0.051      0.048
  avg    0.057         0.057 (0%)    0.057 (0%) 0.055 (+5%)
  max    0.066         0.068         0.070      0.063
  stdev  +/-9%         +/-9%         +/-8%      +/-9%

More performance numbers here:
  https://lkml.kernel.org/r/20161203214707.GI20785@codeblueprint.co.uk

Tested-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten.Rasmussen@arm.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dietmar.eggemann@arm.com
Cc: kernellwp@gmail.com
Cc: umgwanakikbuti@gmail.com
Cc: yuyang.du@intel.com
Link: http://lkml.kernel.org/r/1481216215-24651-3-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
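For illustration, here is a minimal standalone sketch of the decision described above, following the same three zones. The helper name prefer_idlest and its parameter list are hypothetical and not part of the patch; "this_*" refers to the local group and "min_*" to the best candidate found so far.

	/*
	 * Sketch only (not the kernel code): return 1 if the candidate
	 * group should be preferred over the local group.
	 */
	static int prefer_idlest(unsigned long this_runnable_load,
				 unsigned long this_avg_load,
				 unsigned long min_runnable_load,
				 unsigned long min_avg_load,
				 unsigned long imbalance,	/* absolute margin */
				 int imbalance_scale)		/* relative margin, in % */
	{
		/* Candidate carries significantly more runnable load: keep local. */
		if (min_runnable_load > this_runnable_load + imbalance)
			return 0;

		/*
		 * Runnable loads are close: break the tie on blocked load and
		 * keep the local group if its load_avg is significantly lower.
		 */
		if (this_runnable_load < min_runnable_load + imbalance &&
		    100 * this_avg_load < imbalance_scale * min_avg_load)
			return 0;

		/* Otherwise prefer the candidate group. */
		return 1;
	}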
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	55
1 file changed, 44 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ebb815f6bda7..6559d197e08a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5405,16 +5405,20 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	struct sched_group *most_spare_sg = NULL;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
+	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
 	unsigned long most_spare = 0, this_spare = 0;
 	int load_idx = sd->forkexec_idx;
-	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;
 
 	if (sd_flag & SD_BALANCE_WAKE)
 		load_idx = sd->wake_idx;
 
 	do {
-		unsigned long load, avg_load, spare_cap, max_spare_cap;
+		unsigned long load, avg_load, runnable_load;
+		unsigned long spare_cap, max_spare_cap;
 		int local_group;
 		int i;
 
@@ -5431,6 +5435,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		 * the group containing the CPU with most spare capacity.
 		 */
 		avg_load = 0;
+		runnable_load = 0;
 		max_spare_cap = 0;
 
 		for_each_cpu(i, sched_group_cpus(group)) {
@@ -5440,7 +5445,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 			else
 				load = target_load(i, load_idx);
 
-			avg_load += load;
+			runnable_load += load;
+
+			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
 			spare_cap = capacity_spare_wake(i, p);
 
@@ -5449,14 +5456,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU capacity of the group */
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
+		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
 
 		if (local_group) {
-			this_load = avg_load;
+			this_runnable_load = runnable_load;
+			this_avg_load = avg_load;
 			this_spare = max_spare_cap;
 		} else {
-			if (avg_load < min_load) {
-				min_load = avg_load;
+			if (min_runnable_load > (runnable_load + imbalance)) {
+				/*
+				 * The runnable load is significantly smaller
+				 * so we can pick this new cpu
+				 */
+				min_runnable_load = runnable_load;
+				min_avg_load = avg_load;
+				idlest = group;
+			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
+				   (100*min_avg_load > imbalance_scale*avg_load)) {
+				/*
+				 * The runnable loads are close so take the
+				 * blocked load into account through avg_load.
+				 */
+				min_avg_load = avg_load;
 				idlest = group;
 			}
 
@@ -5482,14 +5506,23 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		goto skip_spare;
 
 	if (this_spare > task_util(p) / 2 &&
-	    imbalance*this_spare > 100*most_spare)
+	    imbalance_scale*this_spare > 100*most_spare)
 		return NULL;
-	else if (most_spare > task_util(p) / 2)
+
+	if (most_spare > task_util(p) / 2)
 		return most_spare_sg;
 
 skip_spare:
-	if (!idlest || 100*this_load < imbalance*min_load)
+	if (!idlest)
+		return NULL;
+
+	if (min_runnable_load > (this_runnable_load + imbalance))
 		return NULL;
+
+	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+	     (100*this_avg_load < imbalance_scale*min_avg_load))
+		return NULL;
+
 	return idlest;
 }
 
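As a rough sense of scale for the new absolute margin, here is a small illustrative calculation. The concrete values are assumptions, not taken from the patch: scale_load_down(NICE_0_LOAD) is 1024 on current kernels and sd->imbalance_pct is commonly 125, but both depend on configuration and domain level.

	unsigned long nice0_load = 1024;			/* scale_load_down(NICE_0_LOAD), assumed */
	unsigned int pct = 125;					/* sd->imbalance_pct, assumed default */
	unsigned long imbalance = nice0_load * (pct - 100) / 100;	/* = 256 */
	int imbalance_scale = 100 + (pct - 100) / 2;			/* = 112 */

With these values, two groups whose scaled runnable loads differ by less than 256 (a quarter of a NICE_0 task's weight) fall into the middle zone, and the tie is broken on load_avg using a ~12% relative margin.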