author:    Patrick Bellasi <patrick.bellasi@arm.com>  2018-03-09 04:52:42 -0500
committer: Ingo Molnar <mingo@kernel.org>             2018-03-20 03:11:06 -0400
commit:    7f65ea42eb00bc902f1c37a71e984e4f4064cfa9
tree:      67b6425dc2cf32268acfe36e7324bb583579326f /kernel/sched/fair.c
parent:    10c18c44a6494167e7a7ca3a3a61a67972017bdf
sched/fair: Add util_est on top of PELT
The util_avg signal computed by PELT is too variable for some use-cases. For example, a big task waking up after a long sleep period will have its utilization almost completely decayed. This introduces some latency before schedutil will be able to pick the best frequency to run a task.

The same issue can affect task placement. Indeed, since the task utilization is already decayed at wakeup, when the task is enqueued on a CPU, a CPU running a big task can be temporarily represented as almost empty. This leads to a race condition where other tasks can potentially be allocated on a CPU which has just started to run a big task that slept for a relatively long period.

Moreover, the PELT utilization of a task can be updated every [ms], thus making it a continuously changing value for certain longer running tasks. This means that the instantaneous PELT utilization of a RUNNING task is not really meaningful to properly support scheduler decisions.

For all these reasons, a more stable signal can do a better job of representing the expected/estimated utilization of a task/cfs_rq. Such a signal can be easily created on top of PELT by still using it as an estimator which produces values to be aggregated on meaningful events.

This patch adds a simple implementation of util_est, a new signal built on top of PELT's util_avg where:

  util_est(task) = max(task::util_avg, f(task::util_avg@dequeue))

This allows us to remember how big a task has been reported to be by PELT in its previous activations via f(task::util_avg@dequeue), which is the new _task_util_est(struct task_struct*) function added by this patch.

If a task should change its behavior and run longer in a new activation, after a certain time its util_est will just track the original PELT signal (i.e. task::util_avg).

The estimated utilization of a cfs_rq is defined only for root cfs_rqs. That's because the only sensible consumers of this signal are the scheduler and schedutil when looking for the overall CPU utilization due to FAIR tasks. For this reason, the estimated utilization of a root cfs_rq is simply defined as:

  util_est(cfs_rq) = max(cfs_rq::util_avg, cfs_rq::util_est::enqueued)

where:

  cfs_rq::util_est::enqueued = sum(_task_util_est(task))
                               for each RUNNABLE task on that root cfs_rq

It's worth noting that the estimated utilization is tracked only for objects of interest, specifically:

 - Tasks: to better support task placement decisions
 - root cfs_rqs: to better support both task placement decisions and frequency selection

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
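For illustration only, here is a minimal standalone C sketch of the estimate defined above. The struct util_est_demo/task_demo types, the max_ul() helper and the sample values are invented for this example (the kernel keeps the real fields in struct sched_avg); only the max()-based combination mirrors the helpers added by the patch.

/*
 * Standalone illustration of util_est: a big task that just woke up
 * keeps its estimated utilization even though PELT has decayed.
 */
#include <stdio.h>

struct util_est_demo {
        unsigned long enqueued;         /* util_avg sampled at last dequeue */
        unsigned long ewma;             /* slow-moving average of those samples */
};

struct task_demo {
        unsigned long util_avg;         /* instantaneous PELT utilization */
        struct util_est_demo util_est;
};

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

/* f(task::util_avg@dequeue): the larger of the two stored samples */
static unsigned long _task_util_est(const struct task_demo *p)
{
        return max_ul(p->util_est.ewma, p->util_est.enqueued);
}

/* util_est(task) = max(task::util_avg, f(task::util_avg@dequeue)) */
static unsigned long task_util_est(const struct task_demo *p)
{
        return max_ul(p->util_avg, _task_util_est(p));
}

int main(void)
{
        /* A "big" task just woken up: PELT has decayed, util_est has not */
        struct task_demo p = {
                .util_avg = 60,                                 /* decayed after sleep */
                .util_est = { .enqueued = 680, .ewma = 640 },
        };

        printf("util_avg=%lu util_est=%lu\n", p.util_avg, task_util_est(&p));
        /* prints: util_avg=60 util_est=680 */
        return 0;
}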
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 122
1 file changed, 116 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3582117e1580..22b59a7facd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+        return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+        struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+        return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+        return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+                                    struct task_struct *p)
+{
+        unsigned int enqueued;
+
+        if (!sched_feat(UTIL_EST))
+                return;
+
+        /* Update root cfs_rq's estimated utilization */
+        enqueued  = cfs_rq->avg.util_est.enqueued;
+        enqueued += _task_util_est(p);
+        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *      abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+        long last_ewma_diff;
+        struct util_est ue;
+
+        if (!sched_feat(UTIL_EST))
+                return;
+
+        /*
+         * Update root cfs_rq's estimated utilization
+         *
+         * If *p is the last task then the root cfs_rq's estimated utilization
+         * of a CPU is 0 by definition.
+         */
+        ue.enqueued = 0;
+        if (cfs_rq->nr_running) {
+                ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+                ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                                     _task_util_est(p));
+        }
+        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+        /*
+         * Skip update of task's estimated utilization when the task has not
+         * yet completed an activation, e.g. being migrated.
+         */
+        if (!task_sleep)
+                return;
+
+        /*
+         * Skip update of task's estimated utilization when its EWMA is
+         * already ~1% close to its last activation value.
+         */
+        ue = p->se.avg.util_est;
+        ue.enqueued = task_util(p);
+        last_ewma_diff = ue.enqueued - ue.ewma;
+        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+                return;
+
+        /*
+         * Update Task's estimated utilization
+         *
+         * When *p completes an activation we can consolidate another sample
+         * of the task size. This is done by storing the current PELT value
+         * as ue.enqueued and by using this value to update the Exponential
+         * Weighted Moving Average (EWMA):
+         *
+         *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+         *          = w *  task_util(p) + ewma(t-1) - w * ewma(t-1)
+         *          = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+         *          = w * ( last_ewma_diff )         + ewma(t-1)
+         *          = w * (last_ewma_diff + ewma(t-1) / w)
+         *
+         * Where 'w' is the weight of new samples, which is configured to be
+         * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+         */
+        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+        ue.ewma  += last_ewma_diff;
+        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+        WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
 #else /* CONFIG_SMP */
 
 static inline int
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
         return 0;
 }
 
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+                 bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 add_nr_running(rq, 1);
 
+        util_est_enqueue(&rq->cfs, p);
         hrtick_update(rq);
 }
 
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 sub_nr_running(rq, 1);
 
+        util_est_dequeue(&rq->cfs, p, task_sleep);
         hrtick_update(rq);
 }
 
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
         return (util >= capacity) ? capacity : util;
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-        return p->se.avg.util_avg;
-}
-
 /*
  * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
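For reference, a small standalone C program (not part of the patch) that exercises the two arithmetic tricks used by within_margin() and the EWMA update above. The WEIGHT_SHIFT value of 2 is assumed from the "w=1/4" note in the code comment, since UTIL_EST_WEIGHT_SHIFT itself is defined outside this hunk; the sample values are invented.

/*
 * Standalone check of:
 *  - within_margin(): abs(x) < y via a single unsigned compare
 *  - the EWMA update done with shifts, w = 1/4 (shift of 2)
 */
#include <stdbool.h>
#include <stdio.h>

#define WEIGHT_SHIFT    2       /* w = 1/4, assumed from the comment above */

static bool within_margin(int value, int margin)
{
        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

/* ewma(t) = w * sample + (1 - w) * ewma(t-1), computed with shifts */
static unsigned long ewma_update(unsigned long ewma, unsigned long sample)
{
        long diff = (long)sample - (long)ewma;  /* last_ewma_diff */

        ewma <<= WEIGHT_SHIFT;                  /* ewma / w */
        ewma  += diff;                          /* + last_ewma_diff */
        ewma >>= WEIGHT_SHIFT;                  /* * w */
        return ewma;
}

int main(void)
{
        unsigned long ewma = 0;
        unsigned long samples[] = { 400, 400, 400, 400, 100 };
        unsigned int i;

        /* abs(5) < 10 holds, abs(-12) < 10 does not */
        printf("within_margin(5, 10)   = %d\n", within_margin(5, 10));
        printf("within_margin(-12, 10) = %d\n", within_margin(-12, 10));

        /* The EWMA converges towards repeated samples, then decays slowly */
        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                ewma = ewma_update(ewma, samples[i]);
                printf("sample=%lu ewma=%lu\n", samples[i], ewma);
        }
        return 0;
}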