-rw-r--r--   include/linux/sched.h   |  29
-rw-r--r--   kernel/sched/debug.c    |   4
-rw-r--r--   kernel/sched/fair.c     | 122
-rw-r--r--   kernel/sched/features.h |   5
4 files changed, 154 insertions, 6 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21b1168da951..f228c6033832 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,6 +274,34 @@ struct load_weight {
         u32                     inv_weight;
 };
 
+/**
+ * struct util_est - Estimated utilization of FAIR tasks
+ * @enqueued: instantaneous estimated utilization of a task/cpu
+ * @ewma:     the Exponential Weighted Moving Average (EWMA)
+ *            utilization of a task
+ *
+ * Support data structure to track an Exponential Weighted Moving Average
+ * (EWMA) of a FAIR task's utilization. New samples are added to the moving
+ * average each time a task completes an activation. The sample weight is
+ * chosen so that the EWMA is relatively insensitive to transient changes in
+ * the task's workload.
+ *
+ * The enqueued attribute has a slightly different meaning for tasks and cpus:
+ * - task:   the task's util_avg at last task dequeue time
+ * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
+ * Thus, the util_est.enqueued of a task represents the contribution to the
+ * estimated utilization of the CPU where that task is currently enqueued.
+ *
+ * Only for tasks do we track a moving average of the past instantaneous
+ * estimated utilization. This allows absorbing sporadic drops in the
+ * utilization of an otherwise almost periodic task.
+ */
+struct util_est {
+        unsigned int            enqueued;
+        unsigned int            ewma;
+#define UTIL_EST_WEIGHT_SHIFT   2
+};
+
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
  * (see __update_load_avg() in kernel/sched/fair.c).
@@ -335,6 +363,7 @@ struct sched_avg {
         unsigned long           load_avg;
         unsigned long           runnable_load_avg;
         unsigned long           util_avg;
+        struct util_est         util_est;
 };
 
 struct sched_statistics {
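
As a standalone illustration (not part of the patch), the userspace sketch below shows the two meanings the kernel-doc gives to util_est.enqueued: a per-task snapshot of util_avg taken at dequeue time, and, on the root cfs_rq, a running sum over the RUNNABLE tasks. The per-task values are made up, and each task contributes max(ewma, enqueued), mirroring _task_util_est() in kernel/sched/fair.c.

/*
 * Standalone illustration of how util_est.enqueued aggregates on a cfs_rq.
 * Plain userspace C with hypothetical per-task values; not kernel code.
 */
#include <stdio.h>

struct util_est {
        unsigned int enqueued;  /* task view: util_avg at last dequeue */
        unsigned int ewma;      /* task view: long-term moving average */
};

int main(void)
{
        /* Three hypothetical RUNNABLE tasks on one CPU's root cfs_rq. */
        struct util_est tasks[] = {
                { .enqueued = 120, .ewma = 150 },
                { .enqueued = 300, .ewma = 280 },
                { .enqueued =  64, .ewma =  90 },
        };
        unsigned int cfs_rq_enqueued = 0;
        unsigned int i;

        for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
                unsigned int est = tasks[i].ewma > tasks[i].enqueued ?
                                   tasks[i].ewma : tasks[i].enqueued;

                cfs_rq_enqueued += est; /* what util_est_enqueue() adds */
        }

        /* cfs_rq view: estimated utilization of the CPU, here 150+300+90. */
        printf("cfs_rq util_est.enqueued = %u\n", cfs_rq_enqueued);
        return 0;
}
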
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 644d9a464380..332303be4beb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                         cfs_rq->avg.runnable_load_avg);
         SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
                         cfs_rq->avg.util_avg);
+        SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
+                        cfs_rq->avg.util_est.enqueued);
         SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
                         cfs_rq->removed.load_avg);
         SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
         P(se.avg.runnable_load_avg);
         P(se.avg.util_avg);
         P(se.avg.last_update_time);
+        P(se.avg.util_est.ewma);
+        P(se.avg.util_est.enqueued);
 #endif
         P(policy);
         P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3582117e1580..22b59a7facd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+        return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+        struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+        return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+        return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+                                    struct task_struct *p)
+{
+        unsigned int enqueued;
+
+        if (!sched_feat(UTIL_EST))
+                return;
+
+        /* Update root cfs_rq's estimated utilization */
+        enqueued  = cfs_rq->avg.util_est.enqueued;
+        enqueued += _task_util_est(p);
+        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+        long last_ewma_diff;
+        struct util_est ue;
+
+        if (!sched_feat(UTIL_EST))
+                return;
+
+        /*
+         * Update root cfs_rq's estimated utilization
+         *
+         * If *p is the last task, then the root cfs_rq's estimated
+         * utilization of a CPU is 0 by definition.
+         */
+        ue.enqueued = 0;
+        if (cfs_rq->nr_running) {
+                ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+                ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                                     _task_util_est(p));
+        }
+        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+        /*
+         * Skip update of the task's estimated utilization when the task has
+         * not yet completed an activation, e.g. when it is being migrated.
+         */
+        if (!task_sleep)
+                return;
+
+        /*
+         * Skip update of the task's estimated utilization when its EWMA is
+         * already within ~1% of its last activation value.
+         */
+        ue = p->se.avg.util_est;
+        ue.enqueued = task_util(p);
+        last_ewma_diff = ue.enqueued - ue.ewma;
+        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+                return;
+
+        /*
+         * Update the task's estimated utilization
+         *
+         * When *p completes an activation we can consolidate another sample
+         * of the task's size. This is done by storing the current PELT value
+         * as ue.enqueued and by using this value to update the Exponential
+         * Weighted Moving Average (EWMA):
+         *
+         *  ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+         *          = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+         *          = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+         *          = w * ( last_ewma_diff ) + ewma(t-1)
+         *          = w * (last_ewma_diff + ewma(t-1) / w)
+         *
+         * where 'w' is the weight of new samples, which is configured to be
+         * 0.25, i.e. w = 1/4 (>>= UTIL_EST_WEIGHT_SHIFT).
+         */
+        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+        ue.ewma  += last_ewma_diff;
+        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+        WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
 #else /* CONFIG_SMP */
 
 static inline int
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
         return 0;
 }
 
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+                 bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 add_nr_running(rq, 1);
 
+        util_est_enqueue(&rq->cfs, p);
         hrtick_update(rq);
 }
 
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 sub_nr_running(rq, 1);
 
+        util_est_dequeue(&rq->cfs, p, task_sleep);
         hrtick_update(rq);
 }
 
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
         return (util >= capacity) ? capacity : util;
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-        return p->se.avg.util_avg;
-}
-
 /*
  * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
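
A note on the within_margin() helper added in the fair.c hunk above: for a positive margin y, and as long as value + margin stays below INT_MAX, the branch-free test (unsigned)(x + y - 1) < (2 * y - 1) is equivalent to abs(x) < y. The userspace self-check below is not part of the patch; it simply exercises that identity with the same margin the patch uses, SCHED_CAPACITY_SCALE / 100 = 1024 / 100 = 10.

/*
 * Self-check of the within_margin() identity used by util_est_dequeue():
 * for margin > 0 and value + margin < INT_MAX,
 *   abs(value) < margin  <=>  (unsigned)(value + margin - 1) < 2 * margin - 1
 * Not kernel code; compile and run as a normal userspace program.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

static inline bool within_margin(int value, int margin)
{
        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
        const int margin = 1024 / 100;  /* SCHED_CAPACITY_SCALE / 100 */
        int value;

        for (value = -2048; value <= 2048; value++)
                assert(within_margin(value, margin) == (abs(value) < margin));

        return 0;
}
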
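The EWMA update at the end of util_est_dequeue() folds the w = 1/4 weight into shifts: ewma' = ((ewma << 2) + (sample - ewma)) >> 2, which is ewma + (sample - ewma) / 4. The userspace sketch below is not part of the patch and uses made-up utilization samples; it walks a few activations through that fixed-point arithmetic to show how a single transient drop barely moves the estimate.

/*
 * Fixed-point EWMA update as done at the end of util_est_dequeue():
 * with w = 1/4 (UTIL_EST_WEIGHT_SHIFT = 2),
 *   ewma' = ((ewma << 2) + (sample - ewma)) >> 2 = ewma + (sample - ewma) / 4
 * Sample values below are hypothetical.
 */
#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT   2

static unsigned int ewma_update(unsigned int ewma, unsigned int sample)
{
        long last_ewma_diff = (long)sample - (long)ewma;

        ewma <<= UTIL_EST_WEIGHT_SHIFT; /* ewma * 4           */
        ewma  += last_ewma_diff;        /* ewma * 3 + sample  */
        ewma >>= UTIL_EST_WEIGHT_SHIFT; /* back to util scale */
        return ewma;
}

int main(void)
{
        /* Hypothetical util_avg samples taken at successive dequeues. */
        unsigned int samples[] = { 400, 380, 60, 410, 395 };
        unsigned int ewma = 0;
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                ewma = ewma_update(ewma, samples[i]);
                printf("sample=%3u  ewma=%3u\n", samples[i], ewma);
        }

        /*
         * The transient drop to 60 only pulls the EWMA down by ~1/4 of the
         * difference (170 -> 142 here), which is what makes the estimate
         * resistant to sporadic idles of an otherwise periodic task.
         */
        return 0;
}
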
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..c459a4b61544 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, false)
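
UTIL_EST is introduced disabled by default. Like other sched_feat() flags, it can normally be flipped at runtime on kernels built with CONFIG_SCHED_DEBUG by writing the feature name, or its NO_-prefixed form, to the sched_features debugfs file; the path used in the sketch below is the conventional location and is an assumption here, not something this patch adds.

/*
 * Hedged sketch: toggle the UTIL_EST scheduler feature at runtime.
 * Assumes CONFIG_SCHED_DEBUG and the conventional debugfs path
 * /sys/kernel/debug/sched_features; run as root.
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        const char *path = "/sys/kernel/debug/sched_features";
        /* "UTIL_EST" enables the feature, "NO_UTIL_EST" disables it. */
        const char *val = (argc > 1 && !strcmp(argv[1], "off")) ?
                          "NO_UTIL_EST" : "UTIL_EST";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "%s\n", val);
        fclose(f);
        return 0;
}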