-rw-r--r--  include/linux/sched.h    |  29
-rw-r--r--  kernel/sched/debug.c     |   4
-rw-r--r--  kernel/sched/fair.c      | 122
-rw-r--r--  kernel/sched/features.h  |   5
4 files changed, 154 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21b1168da951..f228c6033832 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,6 +274,34 @@ struct load_weight {
 	u32 inv_weight;
 };
 
+/**
+ * struct util_est - Estimated utilization of FAIR tasks
+ * @enqueued: instantaneous estimated utilization of a task/cpu
+ * @ewma:     the Exponential Weighted Moving Average (EWMA)
+ *            utilization of a task
+ *
+ * Support data structure to track an Exponential Weighted Moving Average
+ * (EWMA) of a FAIR task's utilization. New samples are added to the moving
+ * average each time a task completes an activation. The sample's weight is
+ * chosen so that the EWMA is relatively insensitive to transient changes in
+ * the task's workload.
+ *
+ * The enqueued attribute has a slightly different meaning for tasks and cpus:
+ * - task:   the task's util_avg at last task dequeue time
+ * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
+ * Thus, the util_est.enqueued of a task represents the contribution to the
+ * estimated utilization of the CPU where that task is currently enqueued.
+ *
+ * Only for tasks do we track a moving average of the past instantaneous
+ * estimated utilization. This lets the estimate absorb sporadic drops in
+ * the utilization of an otherwise almost periodic task.
+ */
+struct util_est {
+	unsigned int			enqueued;
+	unsigned int			ewma;
+#define UTIL_EST_WEIGHT_SHIFT		2
+};
+
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
  * (see __update_load_avg() in kernel/sched/fair.c).
@@ -335,6 +363,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_load_avg;
 	unsigned long			util_avg;
+	struct util_est			util_est;
 };
 
 struct sched_statistics {
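The kerneldoc comment above notes that the sample weight is chosen so the EWMA stays relatively insensitive to transient changes, and UTIL_EST_WEIGHT_SHIFT = 2 makes that weight w = 1/4. Below is a minimal stand-alone sketch of that update rule (illustration only, with made-up utilization values; not part of the patch):

/*
 * Stand-alone sketch (not kernel code): the EWMA update implied by
 * UTIL_EST_WEIGHT_SHIFT = 2, i.e. ewma += (sample - ewma) / 4,
 * computed with shifts the way the fair.c hunk below does.
 */
#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2

static unsigned int ewma_update(unsigned int ewma, unsigned int sample)
{
	int diff = (int)sample - (int)ewma;

	return ((ewma << UTIL_EST_WEIGHT_SHIFT) + diff) >> UTIL_EST_WEIGHT_SHIFT;
}

int main(void)
{
	unsigned int samples[] = { 400, 400, 100, 400, 400 };	/* made-up values */
	unsigned int ewma = 400;

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		ewma = ewma_update(ewma, samples[i]);
		printf("sample=%u ewma=%u\n", samples[i], ewma);
	}
	return 0;
}

With w = 1/4, a single dip from 400 down to 100 pulls the estimate only to 325, and it drifts back toward 400 over the next few activations.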
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 644d9a464380..332303be4beb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_load_avg);
 	SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
+	SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
+			cfs_rq->avg.util_est.enqueued);
 	SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_load_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
+	P(se.avg.util_est.ewma);
+	P(se.avg.util_est.enqueued);
 #endif
 	P(policy);
 	P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3582117e1580..22b59a7facd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+	struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+	return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+	return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+				    struct task_struct *p)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Update root cfs_rq's estimated utilization */
+	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued += _task_util_est(p);
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+	long last_ewma_diff;
+	struct util_est ue;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/*
+	 * Update root cfs_rq's estimated utilization
+	 *
+	 * If *p is the last task then the root cfs_rq's estimated utilization
+	 * of a CPU is 0 by definition.
+	 */
+	ue.enqueued = 0;
+	if (cfs_rq->nr_running) {
+		ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+		ue.enqueued -= min_t(unsigned int, ue.enqueued,
+				     _task_util_est(p));
+	}
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+	/*
+	 * Skip update of task's estimated utilization when the task has not
+	 * yet completed an activation, e.g. being migrated.
+	 */
+	if (!task_sleep)
+		return;
+
+	/*
+	 * Skip update of task's estimated utilization when its EWMA is
+	 * already ~1% close to its last activation value.
+	 */
+	ue = p->se.avg.util_est;
+	ue.enqueued = task_util(p);
+	last_ewma_diff = ue.enqueued - ue.ewma;
+	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+		return;
+
+	/*
+	 * Update the task's estimated utilization
+	 *
+	 * When *p completes an activation we can consolidate another sample
+	 * of the task's size. This is done by storing the current PELT value
+	 * as ue.enqueued and by using this value to update the Exponential
+	 * Weighted Moving Average (EWMA):
+	 *
+	 *   ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+	 *           = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+	 *           = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+	 *           = w * (      last_ewma_diff            ) +     ewma(t-1)
+	 *           = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *
+	 * Where 'w' is the weight of new samples, which is configured to be
+	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+	 */
+	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ue.ewma  += last_ewma_diff;
+	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+	WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
 #else /* CONFIG_SMP */
 
 static inline int
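The within_margin() helper added above relies on an unsigned-comparison identity to test abs(x) < y without branching on the sign. A quick stand-alone check of that identity against plain abs() (a sketch, not part of the patch) could look like this:

/*
 * Stand-alone check (not kernel code) of the identity used by within_margin():
 * abs(x) < y  <=>  (unsigned)(x + y - 1) < (2 * y - 1),
 * valid as long as value + margin stays below INT_MAX.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool within_margin(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	int margin = 1024 / 100;	/* ~1% of SCHED_CAPACITY_SCALE (1024) */
	int x;

	for (x = -4096; x <= 4096; x++)
		assert(within_margin(x, margin) == (abs(x) < margin));

	printf("identity holds for margin=%d\n", margin);
	return 0;
}

The patch applies it with a margin of SCHED_CAPACITY_SCALE / 100, i.e. roughly 1% of a fully utilized CPU, to skip EWMA updates that would barely move the estimate.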
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
 	return 0;
 }
 
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+		 bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
+	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		sub_nr_running(rq, 1);
 
+	util_est_dequeue(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
 }
 
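With both hooks in place, the root cfs_rq's util_est.enqueued tracks the sum of max(ewma, enqueued) over the tasks currently runnable on that CPU. A toy user-space model of that bookkeeping (hypothetical task values; not kernel code):

/*
 * Toy model (not kernel code): the CPU-level estimate is the sum of each
 * runnable task's max(ewma, enqueued), added at enqueue and removed at
 * dequeue, mirroring what util_est_enqueue()/util_est_dequeue() do above.
 */
#include <stdio.h>

struct util_est {
	unsigned int enqueued;
	unsigned int ewma;
};

static unsigned int task_estimate(const struct util_est *ue)
{
	return ue->ewma > ue->enqueued ? ue->ewma : ue->enqueued;
}

int main(void)
{
	/* hypothetical per-task estimates */
	struct util_est tasks[] = {
		{ .enqueued = 120, .ewma = 150 },
		{ .enqueued = 300, .ewma = 280 },
	};
	unsigned int cfs_rq_est = 0;
	unsigned int i;

	/* enqueue both tasks: each adds its contribution */
	for (i = 0; i < 2; i++)
		cfs_rq_est += task_estimate(&tasks[i]);
	printf("after enqueue: %u\n", cfs_rq_est);	/* 150 + 300 = 450 */

	/* dequeue the first task: its contribution is removed again */
	cfs_rq_est -= task_estimate(&tasks[0]);
	printf("after dequeue: %u\n", cfs_rq_est);	/* 300 */

	return 0;
}

The real dequeue path additionally clamps the subtraction with min_t() so that a stale per-task value cannot underflow the CPU-level sum.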
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-	return p->se.avg.util_avg;
-}
-
 /*
  * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..c459a4b61544 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, false)
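The feature lands default-disabled here. On kernels of this vintage it can usually be toggled at runtime through the sched_features debugfs interface (typically /sys/kernel/debug/sched_features, with CONFIG_SCHED_DEBUG enabled) by writing UTIL_EST or NO_UTIL_EST; the exact path and availability depend on the kernel configuration.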