-rw-r--r--  include/linux/sched.h    |  29
-rw-r--r--  kernel/sched/debug.c     |   4
-rw-r--r--  kernel/sched/fair.c      | 122
-rw-r--r--  kernel/sched/features.h  |   5
4 files changed, 154 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21b1168da951..f228c6033832 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,6 +274,34 @@ struct load_weight {
 	u32 inv_weight;
 };
 
+/**
+ * struct util_est - Estimated utilization of FAIR tasks
+ * @enqueued: instantaneous estimated utilization of a task/cpu
+ * @ewma:     the Exponential Weighted Moving Average (EWMA)
+ *            utilization of a task
+ *
+ * Support data structure to track an Exponential Weighted Moving Average
+ * (EWMA) of a FAIR task's utilization. New samples are added to the moving
+ * average each time a task completes an activation. The sample's weight is
+ * chosen so that the EWMA is relatively insensitive to transient changes in
+ * the task's workload.
+ *
+ * The enqueued attribute has a slightly different meaning for tasks and cpus:
+ * - task:   the task's util_avg at last task dequeue time
+ * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
+ * Thus, the util_est.enqueued of a task represents the contribution to the
+ * estimated utilization of the CPU where that task is currently enqueued.
+ *
+ * Only for tasks do we track a moving average of the past instantaneous
+ * estimated utilization. This lets the estimate absorb sporadic drops in
+ * the utilization of an otherwise almost periodic task.
+ */
+struct util_est {
+	unsigned int			enqueued;
+	unsigned int			ewma;
+#define UTIL_EST_WEIGHT_SHIFT		2
+};
+
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
  * (see __update_load_avg() in kernel/sched/fair.c).
@@ -335,6 +363,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_load_avg;
 	unsigned long			util_avg;
+	struct util_est			util_est;
 };
 
 struct sched_statistics {
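The kerneldoc comment above notes that the sample weight is chosen so the EWMA stays relatively insensitive to transient changes, and UTIL_EST_WEIGHT_SHIFT = 2 makes that weight w = 1/4. Below is a minimal stand-alone sketch of that update rule (illustration only, with made-up utilization values; not part of the patch):

/*
 * Stand-alone sketch (not kernel code): the EWMA update implied by
 * UTIL_EST_WEIGHT_SHIFT = 2, i.e. ewma += (sample - ewma) / 4,
 * computed with shifts the way the fair.c hunk below does.
 */
#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2

static unsigned int ewma_update(unsigned int ewma, unsigned int sample)
{
	int diff = (int)sample - (int)ewma;

	return ((ewma << UTIL_EST_WEIGHT_SHIFT) + diff) >> UTIL_EST_WEIGHT_SHIFT;
}

int main(void)
{
	unsigned int samples[] = { 400, 400, 100, 400, 400 };	/* made-up values */
	unsigned int ewma = 400;

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		ewma = ewma_update(ewma, samples[i]);
		printf("sample=%u ewma=%u\n", samples[i], ewma);
	}
	return 0;
}

With w = 1/4, a single dip from 400 down to 100 pulls the estimate only to 325, and it drifts back toward 400 over the next few activations.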
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 644d9a464380..332303be4beb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_load_avg);
 	SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
+	SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
+			cfs_rq->avg.util_est.enqueued);
 	SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_load_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
+	P(se.avg.util_est.ewma);
+	P(se.avg.util_est.enqueued);
 #endif
 	P(policy);
 	P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3582117e1580..22b59a7facd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+	struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+	return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+	return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+				    struct task_struct *p)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Update root cfs_rq's estimated utilization */
+	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued += _task_util_est(p);
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+	long last_ewma_diff;
+	struct util_est ue;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/*
+	 * Update root cfs_rq's estimated utilization
+	 *
+	 * If *p is the last task then the root cfs_rq's estimated utilization
+	 * of a CPU is 0 by definition.
+	 */
+	ue.enqueued = 0;
+	if (cfs_rq->nr_running) {
+		ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+		ue.enqueued -= min_t(unsigned int, ue.enqueued,
+				     _task_util_est(p));
+	}
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+	/*
+	 * Skip update of task's estimated utilization when the task has not
+	 * yet completed an activation, e.g. being migrated.
+	 */
+	if (!task_sleep)
+		return;
+
+	/*
+	 * Skip update of task's estimated utilization when its EWMA is
+	 * already ~1% close to its last activation value.
+	 */
+	ue = p->se.avg.util_est;
+	ue.enqueued = task_util(p);
+	last_ewma_diff = ue.enqueued - ue.ewma;
+	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+		return;
+
+	/*
+	 * Update the task's estimated utilization
+	 *
+	 * When *p completes an activation we can consolidate another sample
+	 * of the task's size. This is done by storing the current PELT value
+	 * as ue.enqueued and by using this value to update the Exponential
+	 * Weighted Moving Average (EWMA):
+	 *
+	 *   ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+	 *           = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+	 *           = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+	 *           = w * (      last_ewma_diff            ) +     ewma(t-1)
+	 *           = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *
+	 * Where 'w' is the weight of new samples, which is configured to be
+	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+	 */
+	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ue.ewma  += last_ewma_diff;
+	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+	WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
 #else /* CONFIG_SMP */
 
 static inline int
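The within_margin() helper added above relies on an unsigned-comparison identity to test abs(x) < y without branching on the sign. A quick stand-alone check of that identity against plain abs() (a sketch, not part of the patch) could look like this:

/*
 * Stand-alone check (not kernel code) of the identity used by within_margin():
 * abs(x) < y  <=>  (unsigned)(x + y - 1) < (2 * y - 1),
 * valid as long as value + margin stays below INT_MAX.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool within_margin(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	int margin = 1024 / 100;	/* ~1% of SCHED_CAPACITY_SCALE (1024) */
	int x;

	for (x = -4096; x <= 4096; x++)
		assert(within_margin(x, margin) == (abs(x) < margin));

	printf("identity holds for margin=%d\n", margin);
	return 0;
}

The patch applies it with a margin of SCHED_CAPACITY_SCALE / 100, i.e. roughly 1% of a fully utilized CPU, to skip EWMA updates that would barely move the estimate.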
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
 	return 0;
 }
 
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+		 bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
+	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		sub_nr_running(rq, 1);
 
+	util_est_dequeue(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
 }
 
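With both hooks in place, the root cfs_rq's util_est.enqueued tracks the sum of max(ewma, enqueued) over the tasks currently runnable on that CPU. A toy user-space model of that bookkeeping (hypothetical task values; not kernel code):

/*
 * Toy model (not kernel code): the CPU-level estimate is the sum of each
 * runnable task's max(ewma, enqueued), added at enqueue and removed at
 * dequeue, mirroring what util_est_enqueue()/util_est_dequeue() do above.
 */
#include <stdio.h>

struct util_est {
	unsigned int enqueued;
	unsigned int ewma;
};

static unsigned int task_estimate(const struct util_est *ue)
{
	return ue->ewma > ue->enqueued ? ue->ewma : ue->enqueued;
}

int main(void)
{
	/* hypothetical per-task estimates */
	struct util_est tasks[] = {
		{ .enqueued = 120, .ewma = 150 },
		{ .enqueued = 300, .ewma = 280 },
	};
	unsigned int cfs_rq_est = 0;
	unsigned int i;

	/* enqueue both tasks: each adds its contribution */
	for (i = 0; i < 2; i++)
		cfs_rq_est += task_estimate(&tasks[i]);
	printf("after enqueue: %u\n", cfs_rq_est);	/* 150 + 300 = 450 */

	/* dequeue the first task: its contribution is removed again */
	cfs_rq_est -= task_estimate(&tasks[0]);
	printf("after dequeue: %u\n", cfs_rq_est);	/* 300 */

	return 0;
}

The real dequeue path additionally clamps the subtraction with min_t() so that a stale per-task value cannot underflow the CPU-level sum.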
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-	return p->se.avg.util_avg;
-}
-
 /*
  * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..c459a4b61544 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, false)
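The feature lands default-disabled here. On kernels of this vintage it can usually be toggled at runtime through the sched_features debugfs interface (typically /sys/kernel/debug/sched_features, with CONFIG_SCHED_DEBUG enabled) by writing UTIL_EST or NO_UTIL_EST; the exact path and availability depend on the kernel configuration.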