author		Vincent Guittot <vincent.guittot@linaro.org>	2019-01-23 10:26:53 -0500
committer	Ingo Molnar <mingo@kernel.org>	2019-02-04 03:13:21 -0500
commit		23127296889fe84b0762b191b5d041e8ba6f2599 (patch)
tree		c9ea109b8c2fff0158bacf7d776dec3c93502932
parent		62478d9911fab9694c195f0ca8e4701de09be98e (diff)
sched/fair: Update scale invariance of PELT
The current implementation of load tracking invariance scales the
contribution with the current frequency and uarch performance (only for
utilization) of the CPU. One main result of this formula is that the
figures are capped by the current capacity of the CPU. Another is that
load_avg is not invariant, because it is not scaled with uarch.

The util_avg of a periodic task that runs r time slots every p time slots
varies in the range:

    U * (1-y^r)/(1-y^p) * y^i < Utilization < U * (1-y^r)/(1-y^p)

where U is the max util_avg value = SCHED_CAPACITY_SCALE.

At a lower capacity, the range becomes:

    U * C * (1-y^r')/(1-y^p) * y^i' < Utilization < U * C * (1-y^r')/(1-y^p)

with C reflecting the compute capacity ratio between the current capacity
and the max capacity. So C tries to compensate for changes in (1-y^r'),
but it can't be accurate.

Instead of scaling the contribution value of the PELT algorithm, we should
scale the running time. The PELT signal aims to track the amount of
computation of tasks and/or rqs, so it seems more correct to scale the
running time to reflect the effective amount of computation done since the
last update.

In order to be fully invariant, we need to apply the same amount of
running time and idle time whatever the current capacity. Because running
at lower capacity implies that the task will run longer, we have to ensure
that the same amount of idle time will be applied when the system becomes
idle and no idle time has been "stolen". But reaching the maximum
utilization value (SCHED_CAPACITY_SCALE) means that the task is seen as an
always-running task whatever the capacity of the CPU (even at max compute
capacity). In this case, we can discard this "stolen" idle time, which
becomes meaningless.

In order to achieve this time scaling, a new clock_pelt is created per rq.
The increase of this clock scales with the current capacity when something
is running on the rq and synchronizes with clock_task when the rq is idle.
With this mechanism, we ensure the same running and idle time whatever the
current capacity. This also makes it possible to simplify the PELT
algorithm by removing all references to uarch and frequency and applying
the same contribution to utilization and loads. Furthermore, the scaling
is done only once per update of the clock (update_rq_clock_task()) instead
of during each update of the sched_entities and cfs/rt/dl_rq of the rq, as
in the current implementation. This is interesting when cgroups are
involved, as shown in the results below:

On a hikey (octo Arm64 platform), with the performance cpufreq governor
and only the shallowest c-state enabled, to remove variance generated by
those power features so that we only track the impact of the PELT
algorithm.

Each test runs 16 times:

	./perf bench sched pipe
	(higher is better)
	kernel	tip/sched/core     + patch
	        ops/seconds        ops/seconds         diff
	cgroup
	root    59652(+/- 0.18%)   59876(+/- 0.24%)    +0.38%
	level1  55608(+/- 0.27%)   55923(+/- 0.24%)    +0.57%
	level2  52115(+/- 0.29%)   52564(+/- 0.22%)    +0.86%

	hackbench -l 1000
	(lower is better)
	kernel	tip/sched/core     + patch
	        duration(sec)      duration(sec)       diff
	cgroup
	root    4.453(+/- 2.37%)   4.383(+/- 2.88%)    -1.57%
	level1  4.859(+/- 8.50%)   4.830(+/- 7.07%)    -0.60%
	level2  5.063(+/- 9.83%)   4.928(+/- 9.66%)    -2.66%

The responsiveness of PELT is also improved with this new algorithm when
the CPU is not running at max capacity. I have put below some examples of
the duration needed to reach some typical load values according to the
capacity of the CPU, with the current implementation and with this patch.
These values have been computed based on the geometric series and the
half period value:

	Util (%)     max capacity  half capacity(mainline)  half capacity(w/ patch)
	972 (95%)    138ms         not reachable            276ms
	486 (47.5%)  30ms          138ms                    60ms
	256 (25%)    13ms          32ms                     26ms

On my hikey (octo Arm64 platform) with the schedutil governor, the time to
reach the max OPP when starting from a null utilization decreases from
223ms with the current scale invariance down to 121ms with the new
algorithm.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten.Rasmussen@arm.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: patrick.bellasi@arm.com
Cc: pjt@google.com
Cc: pkondeti@codeaurora.org
Cc: quentin.perret@arm.com
Cc: rjw@rjwysocki.net
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Link: https://lkml.kernel.org/r/1548257214-13745-3-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
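[Editor's note: the figures above can be sanity-checked outside the kernel. The following standalone C sketch — an illustration, not kernel code — replays the geometric series with the PELT constants implied by the message (y^32 = 0.5, a ~1ms/1024us period, U = SCHED_CAPACITY_SCALE = 1024) and reproduces both the peak-utilization formulas and, up to rounding, the convergence-time table.]

	/* Sanity check of the commit message figures.
	 * Build: cc -O2 pelt_check.c -o pelt_check -lm
	 */
	#include <math.h>
	#include <stdio.h>

	#define U 1024.0		/* SCHED_CAPACITY_SCALE */

	static double y;		/* per-period decay, y^32 = 0.5 */

	/* Peak util_avg of a task running r periods every p periods:
	 * U * (1 - y^r) / (1 - y^p), the upper bound of the range above. */
	static double peak_util(double r, double p)
	{
		return U * (1.0 - pow(y, r)) / (1.0 - pow(y, p));
	}

	/* Periods (~1ms each) for an always-running task to reach util:
	 * solve U * (1 - y^n) = util for n. */
	static double periods_to(double util)
	{
		return log(1.0 - util / U) / log(y);
	}

	int main(void)
	{
		y = pow(0.5, 1.0 / 32.0);

		/*
		 * Example: r = 8, p = 32. With this patch, a half-capacity CPU
		 * runs the task twice as long in wall time but clock_pelt
		 * advances at half rate, so PELT still sees r = 8 and the peak
		 * is unchanged. Mainline instead sees r' = 16 capped by C = 0.5.
		 */
		printf("peak @max capacity           : %5.1f\n", peak_util(8, 32));
		printf("peak @half capacity, mainline: %5.1f\n", 0.5 * peak_util(16, 32));
		printf("peak @half capacity, w/patch : %5.1f\n", peak_util(8, 32));

		/* Convergence times of the table: wall time doubles at half
		 * capacity with the patch, since clock_pelt runs at half rate. */
		printf("util 972: ~%3.0fms @max, ~%3.0fms @half w/patch\n",
		       periods_to(972), 2 * periods_to(972));
		printf("util 486: ~%3.0fms @max, ~%3.0fms @half w/patch\n",
		       periods_to(486), 2 * periods_to(486));
		printf("util 256: ~%3.0fms @max, ~%3.0fms @half w/patch\n",
		       periods_to(256), 2 * periods_to(256));
		return 0;
	}

[Under these assumptions the sketch prints ~326 for both the max-capacity and patched half-capacity peaks, but ~300 for mainline at half capacity — consistent with the claim that scaling the contribution by C cannot be accurate.]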
-rw-r--r--	include/linux/sched.h	23
-rw-r--r--	kernel/sched/core.c	1
-rw-r--r--	kernel/sched/deadline.c	6
-rw-r--r--	kernel/sched/fair.c	45
-rw-r--r--	kernel/sched/pelt.c	45
-rw-r--r--	kernel/sched/pelt.h	114
-rw-r--r--	kernel/sched/rt.c	6
-rw-r--r--	kernel/sched/sched.h	5
8 files changed, 177 insertions, 68 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 628bf13cb5a5..351c0fe64c85 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -357,12 +357,6 @@ struct util_est {
  * For cfs_rq, it is the aggregated load_avg of all runnable and
  * blocked sched_entities.
  *
- * load_avg may also take frequency scaling into account:
- *
- *   load_avg = runnable% * scale_load_down(load) * freq%
- *
- * where freq% is the CPU frequency normalized to the highest frequency.
- *
  * [util_avg definition]
  *
  *   util_avg = running% * SCHED_CAPACITY_SCALE
@@ -371,17 +365,14 @@ struct util_est {
  * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
  * and blocked sched_entities.
  *
- * util_avg may also factor frequency scaling and CPU capacity scaling:
- *
- *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
- *
- * where freq% is the same as above, and capacity% is the CPU capacity
- * normalized to the greatest capacity (due to uarch differences, etc).
+ * load_avg and util_avg don't directly factor frequency scaling and CPU
+ * capacity scaling. The scaling is done through the rq_clock_pelt that
+ * is used for computing those signals (see update_rq_clock_pelt())
  *
- * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
- * themselves are in the range of [0, 1]. To do fixed point arithmetics,
- * we therefore scale them to as large a range as necessary. This is for
- * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ * N.B., the above ratios (runnable% and running%) themselves are in the
+ * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
+ * to as large a range as necessary. This is for example reflected by
+ * util_avg's SCHED_CAPACITY_SCALE.
  *
  * [Overflow issue]
  *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a674c7db2f29..32e06704565e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -180,6 +180,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq, irq_delta + steal);
 #endif
+	update_rq_clock_pelt(rq, delta);
 }
 
 void update_rq_clock(struct rq *rq)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb8b7b5d745d..6a73e41a2016 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	deadline_queue_push_tasks(rq);
 
 	if (rq->curr->sched_class != &dl_sched_class)
-		update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
 	return p;
 }
@@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
 	update_curr_dl(rq);
 
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
@@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1793{ 1793{
1794 update_curr_dl(rq); 1794 update_curr_dl(rq);
1795 1795
1796 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); 1796 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1797 /* 1797 /*
1798 * Even when we have runtime, update_curr_dl() might have resulted in us 1798 * Even when we have runtime, update_curr_dl() might have resulted in us
1799 * not being the leftmost task anymore. In that case NEED_RESCHED will 1799 * not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da13e834e990..f41f2eec6186 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -673,9 +673,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-#ifdef CONFIG_SMP
 #include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
@@ -763,7 +762,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 			 * such that the next switched_to_fair() has the
 			 * expected state.
 			 */
-			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+			se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 			return;
 		}
 	}
@@ -3109,7 +3108,7 @@ void set_task_rq_fair(struct sched_entity *se,
 	p_last_update_time = prev->avg.last_update_time;
 	n_last_update_time = next->avg.last_update_time;
 #endif
-	__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+	__update_load_avg_blocked_se(p_last_update_time, se);
 	se->avg.last_update_time = n_last_update_time;
 }
 
@@ -3244,11 +3243,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 
 	/*
 	 * runnable_sum can't be lower than running_sum
-	 * As running sum is scale with CPU capacity wehreas the runnable sum
-	 * is not we rescale running_sum 1st
+	 * Rescale running sum to be in the same range as runnable sum
+	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
+	 * runnable_sum is in [0 : LOAD_AVG_MAX]
 	 */
-	running_sum = se->avg.util_sum /
-		arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
 	runnable_sum = max(runnable_sum, running_sum);
 
 	load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3351,7 +3350,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
 
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
  * @cfs_rq: cfs_rq to update
  *
  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3396,7 +3395,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		decayed = 1;
 	}
 
-	decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
 
 #ifndef CONFIG_64BIT
 	smp_wmb();
@@ -3486,9 +3485,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 /* Update task and its cfs_rq load average */
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	struct rq *rq = rq_of(cfs_rq);
-	int cpu = cpu_of(rq);
+	u64 now = cfs_rq_clock_pelt(cfs_rq);
 	int decayed;
 
 	/*
@@ -3496,7 +3493,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
-		__update_load_avg_se(now, cpu, cfs_rq, se);
+		__update_load_avg_se(now, cfs_rq, se);
 
 	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 	decayed |= propagate_entity_load_avg(se);
@@ -3548,7 +3545,7 @@ void sync_entity_load_avg(struct sched_entity *se)
 	u64 last_update_time;
 
 	last_update_time = cfs_rq_last_update_time(cfs_rq);
-	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+	__update_load_avg_blocked_se(last_update_time, se);
 }
 
 /*
@@ -7015,6 +7012,12 @@ idle:
 	if (new_tasks > 0)
 		goto again;
 
+	/*
+	 * rq is about to be idle, check if we need to update the
+	 * lost_idle_time of clock_pelt
+	 */
+	update_idle_rq_clock_pelt(rq);
+
 	return NULL;
 }
 
@@ -7657,7 +7660,7 @@ static void update_blocked_averages(int cpu)
 		if (throttled_hierarchy(cfs_rq))
 			continue;
 
-		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
 
 		/* Propagate pending load changes to the parent, if any: */
@@ -7671,8 +7674,8 @@ static void update_blocked_averages(int cpu)
 	}
 
 	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
 	/* Don't need periodic decay once load/util_avg are null */
 	if (others_have_blocked(rq))
@@ -7742,11 +7745,11 @@ static inline void update_blocked_averages(int cpu)
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
 
 	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
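[Editor's note on the update_tg_cfs_runnable() hunk above: with contributions now accumulated as contrib << SCHED_CAPACITY_SHIFT, util_sum lives in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] while runnable_sum stays in [0 : LOAD_AVG_MAX], so a constant right shift — rather than a per-CPU capacity division — puts running_sum back into runnable_sum's range. A tiny hedged check of that arithmetic, using LOAD_AVG_MAX = 47742 from the generated sched-pelt.h:]

	#include <stdio.h>

	#define SCHED_CAPACITY_SHIFT	10
	#define LOAD_AVG_MAX		47742	/* from the generated sched-pelt.h */

	int main(void)
	{
		/* worst case: a fully running se, util_sum at its upper bound */
		unsigned long util_sum = (unsigned long)LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT;
		unsigned long runnable_sum = 12345;	/* arbitrary example value */

		/* same rescale as the patched update_tg_cfs_runnable() */
		unsigned long running_sum = util_sum >> SCHED_CAPACITY_SHIFT;

		/* after the shift both operands are in [0 : LOAD_AVG_MAX],
		 * so max(runnable_sum, running_sum) compares like with like */
		if (running_sum > runnable_sum)
			runnable_sum = running_sum;

		printf("util_sum bound : %lu\n", util_sum);	/* 48887808 */
		printf("running_sum    : %lu\n", running_sum);	/* 47742 */
		printf("runnable_sum   : %lu\n", runnable_sum);	/* 47742 */
		return 0;
	}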
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 90fb5bc12ad4..befce29bd882 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -26,7 +26,6 @@
 
 #include <linux/sched.h>
 #include "sched.h"
-#include "sched-pelt.h"
 #include "pelt.h"
 
 /*
@@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  *                     n=1
  */
 static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
 	       unsigned long load, unsigned long runnable, int running)
 {
-	unsigned long scale_freq, scale_cpu;
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 	u64 periods;
 
-	scale_freq = arch_scale_freq_capacity(cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
 	delta += sa->period_contrib;
 	periods = delta / 1024; /* A period is 1024us (~1ms) */
 
@@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	}
 	sa->period_contrib = delta;
 
-	contrib = cap_scale(contrib, scale_freq);
 	if (load)
 		sa->load_sum += load * contrib;
 	if (runnable)
 		sa->runnable_load_sum += runnable * contrib;
 	if (running)
-		sa->util_sum += contrib * scale_cpu;
+		sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
 
 	return periods;
 }
@@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+___update_load_sum(u64 now, struct sched_avg *sa,
 		  unsigned long load, unsigned long runnable, int running)
 {
 	u64 delta;
@@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+	if (!accumulate_sum(delta, sa, load, runnable, running))
 		return 0;
 
 	return 1;
@@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
  *   runnable_load_avg = \Sum se->avg.runable_load_avg
  */
 
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 {
-	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
 		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 		return 1;
 	}
@@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 	return 0;
 }
 
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+	if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
 				cfs_rq->curr == se)) {
 
 		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
@@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e
 	return 0;
 }
 
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 {
-	if (___update_load_sum(now, cpu, &cfs_rq->avg,
+	if (___update_load_sum(now, &cfs_rq->avg,
 			scale_load_down(cfs_rq->load.weight),
 			scale_load_down(cfs_rq->runnable_weight),
 			cfs_rq->curr != NULL)) {
@@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
 {
-	if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+	if (___update_load_sum(now, &rq->avg_rt,
 				running,
 				running,
 				running)) {
@@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
 
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 {
-	if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+	if (___update_load_sum(now, &rq->avg_dl,
 				running,
 				running,
 				running)) {
@@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 int update_irq_load_avg(struct rq *rq, u64 running)
 {
 	int ret = 0;
+
+	/*
+	 * We can't use clock_pelt because irq time is not accounted in
+	 * clock_task. Instead we directly scale the running time to
+	 * reflect the real amount of computation
+	 */
+	running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+	running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
 	/*
 	 * We know the time that has been used by interrupt since last update
 	 * but we don't when. Let be pessimistic and assume that interrupt has
 	 * happened just before the update. This is not so far from reality
 	 * because interrupt will most probably wake up task and trig an update
-	 * of rq clock during which the metric si updated.
+	 * of rq clock during which the metric is updated.
 	 * We start to decay with normal context time and then we add the
 	 * interrupt context time.
 	 * We can safely remove running from rq->clock because
 	 * rq->clock += delta with delta >= running
 	 */
-	ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+	ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
 				0,
 				0,
 				0);
-	ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+	ret += ___update_load_sum(rq->clock, &rq->avg_irq,
 				1,
 				1,
 				1);
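[Editor's note: irq time cannot use clock_pelt (it is not accounted in clock_task), so the hunk above scales the running time directly, applying cap_scale() twice — frequency invariance, then CPU (uarch) invariance. A minimal userspace illustration of that double scaling, reusing the kernel's cap_scale() definition but with made-up capacity values:]

	#include <stdio.h>

	#define SCHED_CAPACITY_SHIFT	10
	#define cap_scale(v, s)		((v) * (s) >> SCHED_CAPACITY_SHIFT)

	int main(void)
	{
		unsigned long running = 1000;	/* 1000us of irq time (example) */
		unsigned long scale_freq = 819;	/* ~80% of max frequency (assumed) */
		unsigned long scale_cpu = 512;	/* half-capacity CPU (assumed) */

		/* same double scaling as update_irq_load_avg(): frequency
		 * invariance first, then CPU (uarch) invariance */
		running = cap_scale(running, scale_freq);
		running = cap_scale(running, scale_cpu);

		/* 1000us at 80% freq on a half-capacity CPU ~= 399us of work */
		printf("scaled irq time: %luus\n", running);
		return 0;
	}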
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b489ff32..7489d5f56960 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
 #ifdef CONFIG_SMP
+#include "sched-pelt.h"
 
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
 
@@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 }
 
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then sync back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time   | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity  ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt      | 1| 2|    3|    4| 7| 8| 9|   10|   11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+	if (unlikely(is_idle_task(rq->curr))) {
+		/* The rq is idle, we can sync to clock_task */
+		rq->clock_pelt  = rq_clock_task(rq);
+		return;
+	}
+
+	/*
+	 * When a rq runs at a lower compute capacity, it will need
+	 * more time to do the same amount of work than at max
+	 * capacity. In order to be invariant, we scale the delta to
+	 * reflect how much work has been really done.
+	 * Running longer results in stealing idle time that will
+	 * disturb the load signal compared to max capacity. This
+	 * stolen idle time will be automatically reflected when the
+	 * rq will be idle and the clock will be synced with
+	 * rq_clock_task.
+	 */
+
+	/*
+	 * Scale the elapsed time to reflect the real amount of
+	 * computation
+	 */
+	delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+	delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+	rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the /Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+	u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+	u32 util_sum = rq->cfs.avg.util_sum;
+	util_sum += rq->avg_rt.util_sum;
+	util_sum += rq->avg_dl.util_sum;
+
+	/*
+	 * Reflecting stolen time makes sense only if the idle
+	 * phase would be present at max capacity. As soon as the
+	 * utilization of a rq has reached the maximum value, it is
+	 * considered as an always running rq without idle time to
+	 * steal. This potential idle time is considered as lost in
+	 * this case. We keep track of this lost idle time compared to
+	 * rq's clock_task.
+	 */
+	if (util_sum >= divider)
+		rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	assert_clock_updated(rq);
+
+	return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(cfs_rq->throttle_count))
+		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+	return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
 #else
 
 static inline int
@@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
 {
 	return 0;
 }
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+	return rq_clock_task(rq);
+}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+
+static inline void
+update_idle_rq_clock_pelt(struct rq *rq) { }
+
 #endif
 
 
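[Editor's note: to make the clock_pelt diagram in the update_rq_clock_pelt() comment concrete, here is a small userspace walk-through of the same bookkeeping — a sketch only, assuming a half-capacity CPU (scale 512/1024), 1024us ticks, and a made-up busy window at ticks 4-9:]

	#include <stdio.h>

	#define SCHED_CAPACITY_SHIFT	10
	#define cap_scale(v, s)		((v) * (s) >> SCHED_CAPACITY_SHIFT)

	int main(void)
	{
		/* assumed pattern: rq busy for ticks 4-9 on a half-capacity CPU */
		int running[16] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
		long clock_task = 0, clock_pelt = 0;

		for (int t = 0; t < 16; t++) {
			long delta = 1024;	/* 1024us elapsed on clock_task */

			clock_task += delta;
			if (running[t])
				/* busy: advance at half rate, like cap_scale(delta, 512) */
				clock_pelt += cap_scale(delta, 512);
			else
				/* idle: sync back to clock_task */
				clock_pelt = clock_task;

			printf("tick %2d: clock_task=%6ld clock_pelt=%6ld\n",
			       t + 1, clock_task, clock_pelt);
		}
		return 0;
	}

[The jump of clock_pelt at the first idle tick is the sync with clock_task that gives back the idle time "stolen" while running slowly — the same gap the diagram shows between pelt ticks 4 and 7.]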
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e4f398ad9e73..90fa23d36565 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 * rt task
 	 */
 	if (rq->curr->sched_class != &rt_sched_class)
-		update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
 	return p;
 }
@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 
 	/*
 	 * The previous task needs to be made eligible for pushing
@@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	struct sched_rt_entity *rt_se = &p->rt;
 
 	update_curr_rt(rq);
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 
 	watchdog(rq, p);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0ed130fae2a9..fe31bc472f3e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -861,7 +861,10 @@ struct rq {
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-	u64			clock_task;
+	/* Ensure that all clocks are in the same cache line */
+	u64			clock_task ____cacheline_aligned;
+	u64			clock_pelt;
+	unsigned long		lost_idle_time;
 
 	atomic_t		nr_iowait;
 