author	Vincent Guittot <vincent.guittot@linaro.org>	2016-12-08 11:56:54 -0500
committer	Ingo Molnar <mingo@kernel.org>	2016-12-11 07:10:57 -0500
commit	6b94780e45c17b83e3e75f8aaca5a328db583c74 (patch)
tree	b372b075bbc4ab12ecf4443670667c295b90c80b /kernel/sched/fair.c
parent	f519a3f1c6b7a990e5aed37a8f853c6ecfdee945 (diff)
sched/core: Use load_avg for selecting idlest group
find_idlest_group() only compares the runnable_load_avg when looking for the least loaded group. But on fork-intensive use cases like hackbench, where tasks block quickly after the fork, this can lead to selecting the same CPU instead of other CPUs which have a similar runnable load but a lower load_avg.

When the runnable_load_avg values of 2 CPUs are close, we now take into account the amount of blocked load as a 2nd selection factor. There are now 3 zones for the runnable_load of the rq:

- [0 .. (runnable_load - imbalance)]:
  Select the new rq which has significantly less runnable_load

- [(runnable_load - imbalance) .. (runnable_load + imbalance)]:
  The runnable loads are close so we use load_avg to choose between the 2 rq

- [(runnable_load + imbalance) .. ULONG_MAX]:
  Keep the current rq which has significantly less runnable_load

The scale factor that is currently used for comparing runnable_load doesn't work well with small values. For example, a scaling factor fails as soon as this_runnable_load == 0, because we then always select the local rq even if min_runnable_load is only 1, which doesn't really make sense because the two loads are essentially the same. So instead of a scaling factor, we use an absolute margin for runnable_load to detect CPUs with similar runnable_load, and we keep using the scaling factor for blocked load.

For use cases like hackbench, this enables the scheduler to select different CPUs during the fork sequence and to spread tasks across the system.

Tests have been done on a Hikey board (ARM based octo cores) for several kernels. The results below give min, max, avg and stdev values of 18 runs with each configuration.

The patches depend on the "no missing update_rq_clock()" work.

hackbench -P -g 1

         ea86cb4b7621  7dc603c9028e  v4.8       v4.8+patches
  min    0.049         0.050         0.051      0.048
  avg    0.057         0.057 (0%)    0.057 (0%) 0.055 (+5%)
  max    0.066         0.068         0.070      0.063
  stdev  +/-9%         +/-9%         +/-8%      +/-9%

More performance numbers here:
  https://lkml.kernel.org/r/20161203214707.GI20785@codeblueprint.co.uk

Tested-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten.Rasmussen@arm.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dietmar.eggemann@arm.com
Cc: kernellwp@gmail.com
Cc: umgwanakikbuti@gmail.com
Cc: yuyang.du@intel.com
Link: http://lkml.kernel.org/r/1481216215-24651-3-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
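For illustration, here is a minimal standalone sketch of the decision described above, following the same three zones. The helper name prefer_idlest and its parameter list are hypothetical and not part of the patch; "this_*" refers to the local group and "min_*" to the best candidate found so far.

	/*
	 * Sketch only (not the kernel code): return 1 if the candidate
	 * group should be preferred over the local group.
	 */
	static int prefer_idlest(unsigned long this_runnable_load,
				 unsigned long this_avg_load,
				 unsigned long min_runnable_load,
				 unsigned long min_avg_load,
				 unsigned long imbalance,	/* absolute margin */
				 int imbalance_scale)		/* relative margin, in % */
	{
		/* Candidate carries significantly more runnable load: keep local. */
		if (min_runnable_load > this_runnable_load + imbalance)
			return 0;

		/*
		 * Runnable loads are close: break the tie on blocked load and
		 * keep the local group if its load_avg is significantly lower.
		 */
		if (this_runnable_load < min_runnable_load + imbalance &&
		    100 * this_avg_load < imbalance_scale * min_avg_load)
			return 0;

		/* Otherwise prefer the candidate group. */
		return 1;
	}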
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	55
1 file changed, 44 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ebb815f6bda7..6559d197e08a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5405,16 +5405,20 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	struct sched_group *most_spare_sg = NULL;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
+	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
 	unsigned long most_spare = 0, this_spare = 0;
 	int load_idx = sd->forkexec_idx;
-	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;
 
 	if (sd_flag & SD_BALANCE_WAKE)
 		load_idx = sd->wake_idx;
 
 	do {
-		unsigned long load, avg_load, spare_cap, max_spare_cap;
+		unsigned long load, avg_load, runnable_load;
+		unsigned long spare_cap, max_spare_cap;
 		int local_group;
 		int i;
 
@@ -5431,6 +5435,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		 * the group containing the CPU with most spare capacity.
 		 */
 		avg_load = 0;
+		runnable_load = 0;
 		max_spare_cap = 0;
 
 		for_each_cpu(i, sched_group_cpus(group)) {
@@ -5440,7 +5445,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 			else
 				load = target_load(i, load_idx);
 
-			avg_load += load;
+			runnable_load += load;
+
+			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
 			spare_cap = capacity_spare_wake(i, p);
 
@@ -5449,14 +5456,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU capacity of the group */
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
+		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
 
 		if (local_group) {
-			this_load = avg_load;
+			this_runnable_load = runnable_load;
+			this_avg_load = avg_load;
 			this_spare = max_spare_cap;
 		} else {
-			if (avg_load < min_load) {
-				min_load = avg_load;
+			if (min_runnable_load > (runnable_load + imbalance)) {
+				/*
+				 * The runnable load is significantly smaller
+				 * so we can pick this new cpu
+				 */
+				min_runnable_load = runnable_load;
+				min_avg_load = avg_load;
+				idlest = group;
+			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
+				   (100*min_avg_load > imbalance_scale*avg_load)) {
+				/*
+				 * The runnable loads are close so take the
+				 * blocked load into account through avg_load.
+				 */
+				min_avg_load = avg_load;
 				idlest = group;
 			}
 
@@ -5482,14 +5506,23 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		goto skip_spare;
 
 	if (this_spare > task_util(p) / 2 &&
-	    imbalance*this_spare > 100*most_spare)
+	    imbalance_scale*this_spare > 100*most_spare)
 		return NULL;
-	else if (most_spare > task_util(p) / 2)
+
+	if (most_spare > task_util(p) / 2)
 		return most_spare_sg;
 
 skip_spare:
-	if (!idlest || 100*this_load < imbalance*min_load)
+	if (!idlest)
+		return NULL;
+
+	if (min_runnable_load > (this_runnable_load + imbalance))
 		return NULL;
+
+	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+	     (100*this_avg_load < imbalance_scale*min_avg_load))
+		return NULL;
+
 	return idlest;
 }
 
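As a rough sense of scale for the new absolute margin, here is a small illustrative calculation. The concrete values are assumptions, not taken from the patch: scale_load_down(NICE_0_LOAD) is 1024 on current kernels and sd->imbalance_pct is commonly 125, but both depend on configuration and domain level.

	unsigned long nice0_load = 1024;			/* scale_load_down(NICE_0_LOAD), assumed */
	unsigned int pct = 125;					/* sd->imbalance_pct, assumed default */
	unsigned long imbalance = nice0_load * (pct - 100) / 100;	/* = 256 */
	int imbalance_scale = 100 + (pct - 100) / 2;			/* = 112 */

With these values, two groups whose scaled runnable loads differ by less than 256 (a quarter of a NICE_0 task's weight) fall into the middle zone, and the tie is broken on load_avg using a ~12% relative margin.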