path: root/kernel/sched
author	Quentin Perret <quentin.perret@arm.com>	2019-09-12 05:44:04 -0400
committer	Ingo Molnar <mingo@kernel.org>	2019-09-13 01:45:17 -0400
commit	eb92692b2544d3f415887dbbc98499843dfe568b (patch)
tree	d1542b308184900dad99eb6545e23207421cc61d /kernel/sched
parent	0413d7f33e60751570fd6c179546bde2f7d82dcb (diff)
sched/fair: Speed-up energy-aware wake-ups
EAS computes the energy impact of migrating a waking task when deciding on
which CPU it should run. However, the current approach is known to have a
high algorithmic complexity, which can result in prohibitively high wake-up
latencies on systems with complex energy models, such as systems with per-CPU
DVFS. On such systems, the algorithm complexity is in O(n^2) (ignoring the
cost of searching for performance states in the EM) with 'n' the number of
CPUs.

To address this, re-factor the EAS wake-up path to compute the energy 'delta'
(with and without the task) on a per-performance domain basis, rather than
system-wide, which brings the complexity down to O(n).

No functional changes intended.

Test results
~~~~~~~~~~~~

* Setup: Tested on a Google Pixel 3, with a Snapdragon 845 (4+4 CPUs,
  A55/A75). Base kernel is 5.3-rc5 + Pixel3 specific patches. Android
  userspace, no graphics.

* Test case: Run a periodic rt-app task, with 16ms period, ramping down from
  70% to 10%, in 5% steps of 500 ms each (json avail. at [1]). Frequencies of
  all CPUs are pinned to max (using scaling_min_freq CPUFreq sysfs entries)
  to reduce variability. The time to run select_task_rq_fair() is measured
  using the function profiler (/sys/kernel/debug/tracing/trace_stat/function*).
  See the test script for more details [2].

Test 1:

I hacked the DT to 'fake' per-CPU DVFS. That is, we end up with one CPUFreq
policy per CPU (8 policies in total). Since all frequencies are pinned to max
for the test, this should have no impact on the actual frequency selection,
but it does in the EAS calculation.

      +---------------------------+----------------------------------+
      |       Without patch       |            With patch            |
+-----+-----+----------+----------+-----+-----------------+----------+
| CPU | Hit | Avg (us) | s^2 (us) | Hit |    Avg (us)     | s^2 (us) |
|-----+-----+----------+----------+-----+-----------------+----------+
|  0  | 274 |  38.303  | 1750.239 | 401 | 14.126 (-63.1%) |  146.625 |
|  1  | 197 |  49.529  | 1695.852 | 314 | 16.135 (-67.4%) |  167.525 |
|  2  | 142 |  34.296  | 1758.665 | 302 | 14.133 (-58.8%) |  130.071 |
|  3  | 172 |  31.734  | 1490.975 | 641 | 14.637 (-53.9%) |  139.189 |
|  4  | 316 |   7.834  |  178.217 | 425 |  5.413 (-30.9%) |   20.803 |
|  5  | 447 |   8.424  |  144.638 | 556 |  5.929 (-29.6%) |   27.301 |
|  6  | 581 |  14.886  |  346.793 | 456 |  5.711 (-61.6%) |   23.124 |
|  7  | 456 |  10.005  |  211.187 | 997 |  4.708 (-52.9%) |   21.144 |
+-----+-----+----------+----------+-----+-----------------+----------+
         * Hit, Avg and s^2 are as reported by the function profiler

Test 2:

I also ran the same test with a normal DT, with 2 CPUFreq policies, to see if
this causes regressions in the most common case.

      +---------------------------+----------------------------------+
      |       Without patch       |            With patch            |
+-----+-----+----------+----------+-----+-----------------+----------+
| CPU | Hit | Avg (us) | s^2 (us) | Hit |    Avg (us)     | s^2 (us) |
|-----+-----+----------+----------+-----+-----------------+----------+
|  0  | 345 |  22.184  |  215.321 | 580 | 18.635 (-16.0%) |  146.892 |
|  1  | 358 |  18.597  |  200.596 | 438 | 12.934 (-30.5%) |  104.604 |
|  2  | 359 |  25.566  |  200.217 | 397 | 10.826 (-57.7%) |   74.021 |
|  3  | 362 |  16.881  |  200.291 | 718 | 11.455 (-32.1%) |  102.280 |
|  4  | 457 |   3.822  |    9.895 | 757 |  4.616 (+20.8%) |   13.369 |
|  5  | 344 |   4.301  |    7.121 | 594 |  5.320 (+23.7%) |   18.798 |
|  6  | 472 |   4.326  |    7.849 | 464 |  5.648 (+30.6%) |   22.022 |
|  7  | 331 |   4.630  |   13.937 | 408 |  5.299 (+14.4%) |   18.273 |
+-----+-----+----------+----------+-----+-----------------+----------+
         * Hit, Avg and s^2 are as reported by the function profiler

In addition to these two tests, I also ran 50 iterations of the Lisa EAS
functional test suite [3] with this patch applied on Arm Juno r0, Arm Juno r2,
Arm TC2 and Hikey960, and could not see any regressions (all EAS functional
tests are passing).

 [1] https://paste.debian.net/1100055/
 [2] https://paste.debian.net/1100057/
 [3] https://github.com/ARM-software/lisa/blob/master/lisa/tests/scheduler/eas_behaviour.py

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dietmar.eggemann@arm.com
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: qais.yousef@arm.com
Cc: qperret@qperret.net
Cc: rjw@rjwysocki.net
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Link: https://lkml.kernel.org/r/20190912094404.13802-1-qperret@qperret.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
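To make the O(n^2) versus O(n) point above concrete, here is a minimal stand-alone C sketch. It is not kernel code and not part of the patch; the CPU count, the one-performance-domain-per-CPU layout and the fake energy figures are assumptions for illustration. It only counts how many per-domain Energy Model evaluations a single wake-up triggers under each scheme:

	/* Toy model of the two wake-up evaluation schemes (illustrative only). */
	#include <stdio.h>

	#define NR_CPUS	8	/* assumption: 8 CPUs, one perf. domain each */

	static int em_evals;	/* number of per-domain EM evaluations made */

	/* Fake per-domain energy estimate: one EM evaluation per call. */
	static unsigned long pd_energy(int pd, int dst_cpu)
	{
		em_evals++;
		return 100 + (dst_cpu == pd ? 10 : 0);	/* made-up figures */
	}

	int main(void)
	{
		int cpu, pd;

		/* Old scheme: every candidate CPU costs a system-wide estimate,
		 * i.e. an inner walk over all performance domains. */
		em_evals = 0;
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			for (pd = 0; pd < NR_CPUS; pd++)
				(void)pd_energy(pd, cpu);
		printf("system-wide estimates: %d EM evaluations\n", em_evals);

		/* New scheme: one 'base' estimate plus one 'with the task'
		 * estimate per domain, and deltas are compared per domain. */
		em_evals = 0;
		for (pd = 0; pd < NR_CPUS; pd++) {
			(void)pd_energy(pd, -1);	/* base energy, without the task */
			(void)pd_energy(pd, pd);	/* best candidate of this domain */
		}
		printf("per-domain deltas:     %d EM evaluations\n", em_evals);

		return 0;
	}

With 8 CPUs and per-CPU DVFS this prints 64 evaluations for the system-wide scheme and 16 for the per-domain delta scheme, matching the quadratic versus linear growth described above.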
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/fair.c	110
1 file changed, 50 insertions(+), 60 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1054d2cf6aaa..8b665110a44a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6251,69 +6251,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 }
 
 /*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
  * to compute what would be the energy if we decided to actually migrate that
  * task.
  */
 static long
 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 {
-	unsigned int max_util, util_cfs, cpu_util, cpu_cap;
-	unsigned long sum_util, energy = 0;
-	struct task_struct *tsk;
+	struct cpumask *pd_mask = perf_domain_span(pd);
+	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+	unsigned long max_util = 0, sum_util = 0;
 	int cpu;
 
-	for (; pd; pd = pd->next) {
-		struct cpumask *pd_mask = perf_domain_span(pd);
+	/*
+	 * The capacity state of CPUs of the current rd can be driven by CPUs
+	 * of another rd if they belong to the same pd. So, account for the
+	 * utilization of these CPUs too by masking pd with cpu_online_mask
+	 * instead of the rd span.
+	 *
+	 * If an entire pd is outside of the current rd, it will not appear in
+	 * its pd list and will not be accounted by compute_energy().
+	 */
+	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+		unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+		struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
 
 		/*
-		 * The energy model mandates all the CPUs of a performance
-		 * domain have the same capacity.
+		 * Busy time computation: utilization clamping is not
+		 * required since the ratio (sum_util / cpu_capacity)
+		 * is already enough to scale the EM reported power
+		 * consumption at the (eventually clamped) cpu_capacity.
 		 */
-		cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
-		max_util = sum_util = 0;
+		sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+					       ENERGY_UTIL, NULL);
 
 		/*
-		 * The capacity state of CPUs of the current rd can be driven by
-		 * CPUs of another rd if they belong to the same performance
-		 * domain. So, account for the utilization of these CPUs too
-		 * by masking pd with cpu_online_mask instead of the rd span.
-		 *
-		 * If an entire performance domain is outside of the current rd,
-		 * it will not appear in its pd list and will not be accounted
-		 * by compute_energy().
+		 * Performance domain frequency: utilization clamping
+		 * must be considered since it affects the selection
+		 * of the performance domain frequency.
+		 * NOTE: in case RT tasks are running, by default the
+		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-			util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
-			/*
-			 * Busy time computation: utilization clamping is not
-			 * required since the ratio (sum_util / cpu_capacity)
-			 * is already enough to scale the EM reported power
-			 * consumption at the (eventually clamped) cpu_capacity.
-			 */
-			sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-						       ENERGY_UTIL, NULL);
-
-			/*
-			 * Performance domain frequency: utilization clamping
-			 * must be considered since it affects the selection
-			 * of the performance domain frequency.
-			 * NOTE: in case RT tasks are running, by default the
-			 * FREQUENCY_UTIL's utilization can be max OPP.
-			 */
-			tsk = cpu == dst_cpu ? p : NULL;
-			cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
-						      FREQUENCY_UTIL, tsk);
-			max_util = max(max_util, cpu_util);
-		}
-
-		energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+		cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+					      FREQUENCY_UTIL, tsk);
+		max_util = max(max_util, cpu_util);
 	}
 
-	return energy;
+	return em_pd_energy(pd->em_pd, max_util, sum_util);
 }
 
 /*
@@ -6355,21 +6341,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
  * other use-cases too. So, until someone finds a better way to solve this,
  * let's keep things simple by re-using the existing slow path.
  */
-
 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
-	unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
 	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	unsigned long cpu_cap, util, base_energy = 0;
 	int cpu, best_energy_cpu = prev_cpu;
-	struct perf_domain *head, *pd;
-	unsigned long cpu_cap, util;
 	struct sched_domain *sd;
+	struct perf_domain *pd;
 
 	rcu_read_lock();
 	pd = rcu_dereference(rd->pd);
 	if (!pd || READ_ONCE(rd->overutilized))
 		goto fail;
-	head = pd;
 
 	/*
 	 * Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6386,9 +6370,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		goto unlock;
 
 	for (; pd; pd = pd->next) {
-		unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+		unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+		unsigned long base_energy_pd;
 		int max_spare_cap_cpu = -1;
 
+		/* Compute the 'base' energy of the pd, without @p */
+		base_energy_pd = compute_energy(p, -1, pd);
+		base_energy += base_energy_pd;
+
 		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
 			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 				continue;
@@ -6401,9 +6390,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
 			/* Always use prev_cpu as a candidate. */
 			if (cpu == prev_cpu) {
-				prev_energy = compute_energy(p, prev_cpu, head);
-				best_energy = min(best_energy, prev_energy);
-				continue;
+				prev_delta = compute_energy(p, prev_cpu, pd);
+				prev_delta -= base_energy_pd;
+				best_delta = min(best_delta, prev_delta);
 			}
 
 			/*
@@ -6419,9 +6408,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
 			/* Evaluate the energy impact of using this CPU. */
 			if (max_spare_cap_cpu >= 0) {
-				cur_energy = compute_energy(p, max_spare_cap_cpu, head);
-				if (cur_energy < best_energy) {
-					best_energy = cur_energy;
+				cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+				cur_delta -= base_energy_pd;
+				if (cur_delta < best_delta) {
+					best_delta = cur_delta;
 					best_energy_cpu = max_spare_cap_cpu;
 				}
 			}
@@ -6433,10 +6423,10 @@ unlock:
 	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
 	 * least 6% of the energy used by prev_cpu.
 	 */
-	if (prev_energy == ULONG_MAX)
+	if (prev_delta == ULONG_MAX)
 		return best_energy_cpu;
 
-	if ((prev_energy - best_energy) > (prev_energy >> 4))
+	if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 		return best_energy_cpu;
 
 	return prev_cpu;
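A short worked note on the acceptance check in the last hunk (the figures are made up for illustration, they are not from the patch): prev_delta + base_energy reconstructs the old prev_energy, i.e. the total estimated energy with the task placed on prev_cpu, so the '6%' comment keeps its meaning; the right shift by four divides by 16, a threshold of about 6.25%. For example, with base_energy = 1000, prev_delta = 100 and best_delta = 90, the estimated saving is 10 while the threshold is (100 + 1000) >> 4 = 68, so the task is left on prev_cpu.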