author:    Patrick Bellasi <patrick.bellasi@arm.com>  2018-03-09 04:52:42 -0500
committer: Ingo Molnar <mingo@kernel.org>             2018-03-20 03:11:06 -0400
commit:    7f65ea42eb00bc902f1c37a71e984e4f4064cfa9
tree:      67b6425dc2cf32268acfe36e7324bb583579326f /kernel/sched/fair.c
parent:    10c18c44a6494167e7a7ca3a3a61a67972017bdf
sched/fair: Add util_est on top of PELT
The util_avg signal computed by PELT is too variable for some use-cases. For example, a big task waking up after a long sleep period will have its utilization almost completely decayed. This introduces some latency before schedutil will be able to pick the best frequency to run a task.

The same issue can affect task placement. Indeed, since the task utilization is already decayed at wakeup, when the task is enqueued on a CPU, a CPU running a big task can be temporarily represented as almost empty. This leads to a race condition where other tasks can potentially be allocated on a CPU which has just started to run a big task that slept for a relatively long period.

Moreover, the PELT utilization of a task can be updated every [ms], thus making it a continuously changing value for certain longer running tasks. This means that the instantaneous PELT utilization of a RUNNING task is not really meaningful to properly support scheduler decisions.

For all these reasons, a more stable signal can do a better job of representing the expected/estimated utilization of a task/cfs_rq. Such a signal can be easily created on top of PELT by still using it as an estimator which produces values to be aggregated on meaningful events.

This patch adds a simple implementation of util_est, a new signal built on top of PELT's util_avg where:

  util_est(task) = max(task::util_avg, f(task::util_avg@dequeue))

This allows us to remember how big a task has been reported to be by PELT in its previous activations via f(task::util_avg@dequeue), which is the new _task_util_est(struct task_struct*) function added by this patch.

If a task should change its behavior and run longer in a new activation, after a certain time its util_est will just track the original PELT signal (i.e. task::util_avg).

The estimated utilization of a cfs_rq is defined only for root cfs_rqs. That's because the only sensible consumers of this signal are the scheduler and schedutil when looking for the overall CPU utilization due to FAIR tasks. For this reason, the estimated utilization of a root cfs_rq is simply defined as:

  util_est(cfs_rq) = max(cfs_rq::util_avg, cfs_rq::util_est::enqueued)

where:

  cfs_rq::util_est::enqueued = sum(_task_util_est(task))
                               for each RUNNABLE task on that root cfs_rq

It's worth noting that the estimated utilization is tracked only for objects of interest, specifically:

 - Tasks: to better support task placement decisions
 - root cfs_rqs: to better support both task placement decisions and frequency selection

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
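For illustration only, here is a minimal standalone C sketch of the estimate defined above. The struct util_est_demo/task_demo types, the max_ul() helper and the sample values are invented for this example (the kernel keeps the real fields in struct sched_avg); only the max()-based combination mirrors the helpers added by the patch.

/*
 * Standalone illustration of util_est: a big task that just woke up
 * keeps its estimated utilization even though PELT has decayed.
 */
#include <stdio.h>

struct util_est_demo {
        unsigned long enqueued;         /* util_avg sampled at last dequeue */
        unsigned long ewma;             /* slow-moving average of those samples */
};

struct task_demo {
        unsigned long util_avg;         /* instantaneous PELT utilization */
        struct util_est_demo util_est;
};

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

/* f(task::util_avg@dequeue): the larger of the two stored samples */
static unsigned long _task_util_est(const struct task_demo *p)
{
        return max_ul(p->util_est.ewma, p->util_est.enqueued);
}

/* util_est(task) = max(task::util_avg, f(task::util_avg@dequeue)) */
static unsigned long task_util_est(const struct task_demo *p)
{
        return max_ul(p->util_avg, _task_util_est(p));
}

int main(void)
{
        /* A "big" task just woken up: PELT has decayed, util_est has not */
        struct task_demo p = {
                .util_avg = 60,                                 /* decayed after sleep */
                .util_est = { .enqueued = 680, .ewma = 640 },
        };

        printf("util_avg=%lu util_est=%lu\n", p.util_avg, task_util_est(&p));
        /* prints: util_avg=60 util_est=680 */
        return 0;
}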
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 122
1 file changed, 116 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3582117e1580..22b59a7facd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+        return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+        struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+        return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+        return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+                                    struct task_struct *p)
+{
+        unsigned int enqueued;
+
+        if (!sched_feat(UTIL_EST))
+                return;
+
+        /* Update root cfs_rq's estimated utilization */
+        enqueued  = cfs_rq->avg.util_est.enqueued;
+        enqueued += _task_util_est(p);
+        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *      abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+        long last_ewma_diff;
+        struct util_est ue;
+
+        if (!sched_feat(UTIL_EST))
+                return;
+
+        /*
+         * Update root cfs_rq's estimated utilization
+         *
+         * If *p is the last task then the root cfs_rq's estimated utilization
+         * of a CPU is 0 by definition.
+         */
+        ue.enqueued = 0;
+        if (cfs_rq->nr_running) {
+                ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+                ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                                     _task_util_est(p));
+        }
+        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+        /*
+         * Skip update of task's estimated utilization when the task has not
+         * yet completed an activation, e.g. being migrated.
+         */
+        if (!task_sleep)
+                return;
+
+        /*
+         * Skip update of task's estimated utilization when its EWMA is
+         * already ~1% close to its last activation value.
+         */
+        ue = p->se.avg.util_est;
+        ue.enqueued = task_util(p);
+        last_ewma_diff = ue.enqueued - ue.ewma;
+        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+                return;
+
+        /*
+         * Update Task's estimated utilization
+         *
+         * When *p completes an activation we can consolidate another sample
+         * of the task size. This is done by storing the current PELT value
+         * as ue.enqueued and by using this value to update the Exponential
+         * Weighted Moving Average (EWMA):
+         *
+         *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+         *          = w *  task_util(p) + ewma(t-1) - w * ewma(t-1)
+         *          = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+         *          = w * ( last_ewma_diff )         + ewma(t-1)
+         *          = w * (last_ewma_diff + ewma(t-1) / w)
+         *
+         * Where 'w' is the weight of new samples, which is configured to be
+         * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+         */
+        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+        ue.ewma  += last_ewma_diff;
+        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+        WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
 #else /* CONFIG_SMP */
 
 static inline int
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
         return 0;
 }
 
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+                 bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 add_nr_running(rq, 1);
 
+        util_est_enqueue(&rq->cfs, p);
         hrtick_update(rq);
 }
 
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 sub_nr_running(rq, 1);
 
+        util_est_dequeue(&rq->cfs, p, task_sleep);
         hrtick_update(rq);
 }
 
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
         return (util >= capacity) ? capacity : util;
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-        return p->se.avg.util_avg;
-}
-
 /*
  * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
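For reference, a small standalone C program (not part of the patch) that exercises the two arithmetic tricks used by within_margin() and the EWMA update above. The WEIGHT_SHIFT value of 2 is assumed from the "w=1/4" note in the code comment, since UTIL_EST_WEIGHT_SHIFT itself is defined outside this hunk; the sample values are invented.

/*
 * Standalone check of:
 *  - within_margin(): abs(x) < y via a single unsigned compare
 *  - the EWMA update done with shifts, w = 1/4 (shift of 2)
 */
#include <stdbool.h>
#include <stdio.h>

#define WEIGHT_SHIFT    2       /* w = 1/4, assumed from the comment above */

static bool within_margin(int value, int margin)
{
        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

/* ewma(t) = w * sample + (1 - w) * ewma(t-1), computed with shifts */
static unsigned long ewma_update(unsigned long ewma, unsigned long sample)
{
        long diff = (long)sample - (long)ewma;  /* last_ewma_diff */

        ewma <<= WEIGHT_SHIFT;                  /* ewma / w */
        ewma  += diff;                          /* + last_ewma_diff */
        ewma >>= WEIGHT_SHIFT;                  /* * w */
        return ewma;
}

int main(void)
{
        unsigned long ewma = 0;
        unsigned long samples[] = { 400, 400, 400, 400, 100 };
        unsigned int i;

        /* abs(5) < 10 holds, abs(-12) < 10 does not */
        printf("within_margin(5, 10)   = %d\n", within_margin(5, 10));
        printf("within_margin(-12, 10) = %d\n", within_margin(-12, 10));

        /* The EWMA converges towards repeated samples, then decays slowly */
        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                ewma = ewma_update(ewma, samples[i]);
                printf("sample=%lu ewma=%lu\n", samples[i], ewma);
        }
        return 0;
}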