author     Patrick Bellasi <patrick.bellasi@arm.com>    2018-03-09 04:52:45 -0500
committer  Ingo Molnar <mingo@kernel.org>               2018-03-20 03:11:09 -0400
commit     d519329f72a6f36bc4f2b85452640cfe583b4f81 (patch)
tree       8be90588a6d222f84108687cbdaa5cb46702e270
parent     a07630b8b2c16f82fd5b71d890079f4dd7599c1d (diff)
sched/fair: Update util_est only on util_avg updates
The estimated utilization of a task is currently updated every time the task is dequeued. However, to keep overheads under control, PELT signals are effectively updated at most once every 1ms.

Thus, for really short running tasks, it can happen that their util_avg value has not been updated since their last enqueue. If such tasks are also frequently running (e.g. the kind of workload generated by hackbench), it can also happen that their util_avg is updated only every few activations.

This means that updating util_est at every dequeue potentially introduces unnecessary overheads, and it is also conceptually wrong if the util_avg signal has never been updated during a task activation.

Let's introduce a throttling mechanism on a task's util_est updates to sync them with util_avg updates. To make the solution memory efficient, both in terms of space and load/store operations, we encode a synchronization flag into the LSB of util_est.enqueued. This makes util_est an even-values-only metric, which is still considered good enough for its purpose. The synchronization bit is cleared (reset) by __update_load_avg_se() once the PELT signal of a task has been updated during its last activation.

Such a throttling mechanism keeps util_est overheads in the wakeup hot path under control, making it suitable to be enabled also on systems running high-intensity workloads. Thus, the estimated utilization scheduler feature is now switched on by default.

Suggested-by: Chris Redpath <chris.redpath@arm.com>
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-5-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
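As an illustration of the flag encoding described above, the following stand-alone sketch models the lifecycle of the UTIL_AVG_UNCHANGED bit in user space: the flag is set in the LSB of util_est.enqueued when a sample is stored at dequeue time, cleared once the PELT (util_avg) signal is actually updated, and tested at the next dequeue to decide whether util_est is worth refreshing. UTIL_AVG_UNCHANGED and the util_est field names mirror the patch; the struct layout, the model_*() helpers and main() are hypothetical simplifications for illustration, not kernel code.

/* Simplified user-space model of the util_est synchronization flag. */
#include <stdio.h>

#define UTIL_AVG_UNCHANGED 0x1	/* LSB of util_est.enqueued, as in the patch */

struct util_est {
	unsigned int enqueued;	/* utilization sampled at dequeue time, LSB = sync flag */
	unsigned int ewma;	/* exponentially weighted moving average */
};

/* Dequeue path: store the sample with the flag set, meaning
 * "util_avg has not been updated since this value was recorded". */
static void model_store_sample(struct util_est *ue, unsigned int task_util)
{
	ue->enqueued = task_util | UTIL_AVG_UNCHANGED;
}

/* PELT update path: clear the flag because util_avg really changed
 * (roughly what cfs_se_util_change() does in the patch). */
static void model_pelt_updated(struct util_est *ue)
{
	ue->enqueued &= ~UTIL_AVG_UNCHANGED;
}

/* Next dequeue: refresh util_est only if the flag was cleared in between. */
static int model_should_update(const struct util_est *ue)
{
	return !(ue->enqueued & UTIL_AVG_UNCHANGED);
}

int main(void)
{
	struct util_est ue = { 0, 0 };

	model_store_sample(&ue, 100);
	printf("no PELT update yet  -> update util_est? %d\n", model_should_update(&ue));

	model_pelt_updated(&ue);
	printf("after a PELT update -> update util_est? %d\n", model_should_update(&ue));

	/* Since the LSB is reserved for the flag, util_est effectively
	 * carries only even utilization values, which the changelog argues
	 * is accurate enough for its purpose. */
	return 0;
}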
-rw-r--r--	kernel/sched/fair.c	42
-rw-r--r--	kernel/sched/features.h	2
2 files changed, 39 insertions, 5 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 570b8d056282..0951d1c58d2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3243,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
 }
 
 /*
+ * When a task is dequeued, its estimated utilization should not be update if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Avoid store if the flag has been already set */
+	enqueued = avg->util_est.enqueued;
+	if (!(enqueued & UTIL_AVG_UNCHANGED))
+		return;
+
+	/* Reset flag to report util_avg has been updated */
+	enqueued &= ~UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
+/*
  * sched_entity:
  *
  * task:
@@ -3293,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
 				cfs_rq->curr == se)) {
 
 		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+		cfs_se_util_change(&se->avg);
 		return 1;
 	}
 
@@ -3900,7 +3927,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 
 	/* Update root cfs_rq's estimated utilization */
 	enqueued = cfs_rq->avg.util_est.enqueued;
-	enqueued += _task_util_est(p);
+	enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
 }
 
@@ -3936,7 +3963,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	if (cfs_rq->nr_running) {
 		ue.enqueued = cfs_rq->avg.util_est.enqueued;
 		ue.enqueued -= min_t(unsigned int, ue.enqueued,
-				     _task_util_est(p));
+				     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
 	}
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
@@ -3948,11 +3975,18 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 		return;
 
 	/*
+	 * If the PELT values haven't changed since enqueue time,
+	 * skip the util_est update.
+	 */
+	ue = p->se.avg.util_est;
+	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+		return;
+
+	/*
 	 * Skip update of task's estimated utilization when its EWMA is
 	 * already ~1% close to its last activation value.
 	 */
-	ue = p->se.avg.util_est;
-	ue.enqueued = task_util(p);
+	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
 	last_ewma_diff = ue.enqueued - ue.ewma;
 	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
 		return;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index c459a4b61544..85ae8488039c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,4 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
 /*
  * UtilEstimation. Use estimated CPU utilization.
  */
-SCHED_FEAT(UTIL_EST, false)
+SCHED_FEAT(UTIL_EST, true)