author	Patrick Bellasi <patrick.bellasi@arm.com>	2019-06-21 04:42:02 -0400
committer	Ingo Molnar <mingo@kernel.org>	2019-06-24 13:23:44 -0400
commit	69842cba9ace84849bb9b8edcdf2cefccd97901c (patch)
tree	7d56d19500dc261558b8f4550119fc8950fac904
parent	a3df067974c52df936f548ed218120f623c4c560 (diff)
sched/uclamp: Add CPU's clamp buckets refcounting
Utilization clamping allows clamping the CPU's utilization within a
[util_min, util_max] range, depending on the set of RUNNABLE tasks on that
CPU. Each task references two "clamp buckets" defining its minimum and
maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is
active if there is at least one RUNNABLE task enqueued on that CPU and
refcounting that bucket.

When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets
on that CPU can change. Since each clamp bucket enforces a different
utilization clamp value, when the set of active clamp buckets changes for a
CPU, a new "aggregated" clamp value is computed for that CPU.

Clamp values are always MAX aggregated for both util_min and util_max. This
ensures that no task can affect the performance of other co-scheduled tasks
which are more boosted (i.e. with higher util_min clamp) or less capped
(i.e. with higher util_max clamp).

A task has:
   task_struct::uclamp[clamp_id]::bucket_id
to track the "bucket index" of the CPU's clamp bucket it refcounts while
enqueued, for each clamp index (clamp_id).

A runqueue has:
   rq::uclamp[clamp_id]::bucket[bucket_id].tasks
to track how many RUNNABLE tasks on that CPU refcount each clamp bucket
(bucket_id) of a clamp index (clamp_id).
It also has a:
   rq::uclamp[clamp_id]::bucket[bucket_id].value
to track the clamp value of each clamp bucket (bucket_id) of a clamp index
(clamp_id).

The rq::uclamp::bucket[clamp_id][] array is scanned every time it's needed
to find a new MAX aggregated clamp value for a clamp_id. This operation is
required only when the last task of a clamp bucket tracking the current MAX
aggregated clamp value is dequeued. In this case, the CPU is either entering
IDLE or going to schedule a less boosted or more clamped task.
The expected number of different clamp values configured at build time is
small enough to fit the full unordered array into a single cache line, for
configurations of up to 7 buckets.

Add to struct rq the basic data structures required to refcount the number
of RUNNABLE tasks for each clamp bucket. Also add the max aggregation
required to update the rq's clamp value at each enqueue/dequeue event.

Use a simple linear mapping of clamp values into clamp buckets. Pre-compute
and cache bucket_id to avoid integer divisions at enqueue/dequeue time.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alessio Balsini <balsini@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
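To illustrate the linear mapping described above, here is a minimal userspace sketch of the bucket arithmetic (hypothetical standalone code, assuming SCHED_CAPACITY_SCALE = 1024 and the default 5 buckets; it mirrors, but does not reuse, the kernel helpers added below):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed capacity scale */
#define UCLAMP_BUCKETS		5	/* assumed CONFIG_UCLAMP_BUCKETS_COUNT */

/* Integer rounded range covered by each bucket (DIV_ROUND_CLOSEST equivalent) */
#define UCLAMP_BUCKET_DELTA	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

/* Linear mapping of a clamp value to its bucket index, as in the patch */
static unsigned int bucket_of(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	/* A 25% boost (256/1024) and a 30% boost (307/1024) share bucket 1 */
	unsigned int boost25 = SCHED_CAPACITY_SCALE / 4;
	unsigned int boost30 = SCHED_CAPACITY_SCALE * 30 / 100;

	printf("delta=%u, 25%% -> bucket %u, 30%% -> bucket %u\n",
	       UCLAMP_BUCKET_DELTA, bucket_of(boost25), bucket_of(boost30));
	return 0;
}

With 5 buckets the delta is 205 capacity units, so both the 25% and the 30% boosted tasks land in bucket 1, which is exactly the sharing behaviour the Kconfig help text below describes.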
-rw-r--r--	include/linux/log2.h	34
-rw-r--r--	include/linux/sched.h	39
-rw-r--r--	include/linux/sched/topology.h	6
-rw-r--r--	init/Kconfig	53
-rw-r--r--	kernel/sched/core.c	166
-rw-r--r--	kernel/sched/sched.h	51
6 files changed, 343 insertions, 6 deletions
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 1aec01365ed4..83a4a3ca3e8a 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
 	ilog2((n) - 1) + 1) :		\
 	__order_base_2(n)		\
 )
+
+static inline __attribute__((const))
+int __bits_per(unsigned long n)
+{
+	if (n < 2)
+		return 1;
+	if (is_power_of_2(n))
+		return order_base_2(n) + 1;
+	return order_base_2(n);
+}
+
+/**
+ * bits_per - calculate the number of bits required for the argument
+ * @n: parameter
+ *
+ * This is constant-capable and can be used for compile time
+ * initializations, e.g bitfields.
+ *
+ * The first few values calculated by this routine:
+ *  bf(0) = 1
+ *  bf(1) = 1
+ *  bf(2) = 2
+ *  bf(3) = 2
+ *  bf(4) = 3
+ *  ... and so on.
+ */
+#define bits_per(n)				\
+(						\
+	__builtin_constant_p(n) ? (		\
+		((n) == 0 || (n) == 1)		\
+			? 1 : ilog2(n) + 1	\
+	) :					\
+	__bits_per(n)				\
+)
 #endif /* _LINUX_LOG2_H */
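The bits_per() helper above is used later in this patch to size bitfields. A rough userspace equivalent (an illustrative sketch, not the kernel macro, using __builtin_clzl() in place of ilog2()) reproduces the table from the kerneldoc:

#include <assert.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's bits_per(): number of bits needed
 * to represent n, with bits_per(0) == bits_per(1) == 1. */
static unsigned int bits_per(unsigned long n)
{
	if (n < 2)
		return 1;
	return (unsigned int)(sizeof(unsigned long) * 8 - __builtin_clzl(n));
}

int main(void)
{
	unsigned long n;

	/* Matches the kerneldoc table: bf(0)=1, bf(1)=1, bf(2)=2, bf(3)=2, bf(4)=3 */
	for (n = 0; n <= 4; n++)
		printf("bits_per(%lu) = %u\n", n, bits_per(n));

	assert(bits_per(1024) == 11);	/* SCHED_CAPACITY_SCALE fits in 11 bits */
	assert(bits_per(5) == 3);	/* a 5-bucket index fits in 3 bits */
	return 0;
}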
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 044c023875e8..80235bcd05f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -283,6 +283,18 @@ struct vtime {
 	u64			gtime;
 };
 
+/*
+ * Utilization clamp constraints.
+ * @UCLAMP_MIN:	Minimum utilization
+ * @UCLAMP_MAX:	Maximum utilization
+ * @UCLAMP_CNT:	Utilization clamp constraints count
+ */
+enum uclamp_id {
+	UCLAMP_MIN = 0,
+	UCLAMP_MAX,
+	UCLAMP_CNT
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -314,6 +326,10 @@ struct sched_info {
 # define SCHED_FIXEDPOINT_SHIFT		10
 # define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)
 
+/* Increase resolution of cpu_capacity calculations */
+# define SCHED_CAPACITY_SHIFT		SCHED_FIXEDPOINT_SHIFT
+# define SCHED_CAPACITY_SCALE		(1L << SCHED_CAPACITY_SHIFT)
+
 struct load_weight {
 	unsigned long			weight;
 	u32				inv_weight;
@@ -562,6 +578,25 @@ struct sched_dl_entity {
 	struct hrtimer			inactive_timer;
 };
 
+#ifdef CONFIG_UCLAMP_TASK
+/* Number of utilization clamp buckets (shorter alias) */
+#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
+
+/*
+ * Utilization clamp for a scheduling entity
+ * @value:		clamp value "assigned" to a se
+ * @bucket_id:		bucket index corresponding to the "assigned" value
+ *
+ * The bucket_id is the index of the clamp bucket matching the clamp value
+ * which is pre-computed and stored to avoid expensive integer divisions from
+ * the fast path.
+ */
+struct uclamp_se {
+	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
+	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 union rcu_special {
 	struct {
 		u8			blocked;
@@ -642,6 +677,10 @@ struct task_struct {
 #endif
 	struct sched_dl_entity		dl;
 
+#ifdef CONFIG_UCLAMP_TASK
+	struct uclamp_se		uclamp[UCLAMP_CNT];
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* List of struct preempt_notifier: */
 	struct hlist_head		preempt_notifiers;
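Given the bitfield widths in the struct uclamp_se hunk above, both per-task fields pack into a single word, and the bucket index is pre-computed rather than derived by division on the fast path. A small illustrative sketch (hypothetical names, assuming the default widths of 11 and 3 bits):

#include <stdio.h>

/* Illustrative stand-in for struct uclamp_se with the default widths:
 * 11 bits for the clamp value (0..1024) and 3 bits for the bucket index. */
struct uclamp_se_sketch {
	unsigned int value	: 11;
	unsigned int bucket_id	: 3;
};

int main(void)
{
	/* value 256 (25% of 1024) maps to bucket 1; both stored together */
	struct uclamp_se_sketch uc_se = { .value = 256, .bucket_id = 1 };

	printf("sizeof = %zu bytes, value=%u bucket_id=%u\n",
	       sizeof(uc_se), uc_se.value, uc_se.bucket_id);
	return 0;
}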
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index e445d3767cdd..7863bb62d2ab 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -7,12 +7,6 @@
 #include <linux/sched/idle.h>
 
 /*
- * Increase resolution of cpu_capacity calculations
- */
-#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
-#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
-
-/*
  * sched-domains (multiprocessor balancing) declarations:
  */
 #ifdef CONFIG_SMP
diff --git a/init/Kconfig b/init/Kconfig
index 0e2344389501..c88289c18d59 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
 config GENERIC_SCHED_CLOCK
 	bool
 
+menu "Scheduler features"
+
+config UCLAMP_TASK
+	bool "Enable utilization clamping for RT/FAIR tasks"
+	depends on CPU_FREQ_GOV_SCHEDUTIL
+	help
+	  This feature enables the scheduler to track the clamped utilization
+	  of each CPU based on RUNNABLE tasks scheduled on that CPU.
+
+	  With this option, the user can specify the min and max CPU
+	  utilization allowed for RUNNABLE tasks. The max utilization defines
+	  the maximum frequency a task should use while the min utilization
+	  defines the minimum frequency it should use.
+
+	  Both min and max utilization clamp values are hints to the scheduler,
+	  aiming at improving its frequency selection policy, but they do not
+	  enforce or grant any specific bandwidth for tasks.
+
+	  If in doubt, say N.
+
+config UCLAMP_BUCKETS_COUNT
+	int "Number of supported utilization clamp buckets"
+	range 5 20
+	default 5
+	depends on UCLAMP_TASK
+	help
+	  Defines the number of clamp buckets to use. The range of each bucket
+	  will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
+	  number of clamp buckets the finer their granularity and the higher
+	  the precision of clamping aggregation and tracking at run-time.
+
+	  For example, with the minimum configuration value we will have 5
+	  clamp buckets tracking 20% utilization each. A 25% boosted task will
+	  be refcounted in the [20..39]% bucket and will set the bucket clamp
+	  effective value to 25%.
+	  If a second 30% boosted task should be co-scheduled on the same CPU,
+	  that task will be refcounted in the same bucket as the first task and
+	  it will boost the bucket clamp effective value to 30%.
+	  The clamp effective value of a bucket is reset to its nominal value
+	  (20% in the example above) when there are no more tasks refcounted in
+	  that bucket.
+
+	  An additional boost/capping margin can be added to some tasks. In the
+	  example above the 25% task will be boosted to 30% until it exits the
+	  CPU. If that should be considered not acceptable on certain systems,
+	  it's always possible to reduce the margin by increasing the number of
+	  clamp buckets to trade off used memory for run-time tracking
+	  precision.
+
+	  If in doubt, use the default value.
+
+endmenu
+
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
 # balancing logic:
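The granularity trade-off described in the UCLAMP_BUCKETS_COUNT help text above can be seen by computing the bucket width for a few configurations (a standalone sketch under the assumption SCHED_CAPACITY_SCALE = 1024):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed capacity scale */

int main(void)
{
	/* The allowed Kconfig range is 5..20 buckets */
	unsigned int counts[] = { 5, 10, 20 };
	unsigned int i;

	for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		unsigned int delta = (SCHED_CAPACITY_SCALE + counts[i] / 2) / counts[i];
		unsigned int pct = (delta * 100 + SCHED_CAPACITY_SCALE / 2) / SCHED_CAPACITY_SCALE;

		/* With 5 buckets a 25% boosted task shares a ~20% wide bucket;
		 * with 20 buckets the worst-case rounding margin shrinks to ~5%. */
		printf("%2u buckets -> each covers %u units (~%u%% of capacity)\n",
		       counts[i], delta, pct);
	}
	return 0;
}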
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5e02d23e693..d8c1e67afd82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 	}
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+#define for_each_clamp_id(clamp_id) \
+	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+	return clamp_value / UCLAMP_BUCKET_DELTA;
+}
+
+static inline unsigned int uclamp_none(int clamp_id)
+{
+	if (clamp_id == UCLAMP_MIN)
+		return 0;
+	return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
+{
+	uc_se->value = value;
+	uc_se->bucket_id = uclamp_bucket_id(value);
+}
+
+static inline
+unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id)
+{
+	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+	int bucket_id = UCLAMP_BUCKETS - 1;
+
+	/*
+	 * Since both min and max clamps are max aggregated, find the
+	 * top most bucket with tasks in.
+	 */
+	for ( ; bucket_id >= 0; bucket_id--) {
+		if (!bucket[bucket_id].tasks)
+			continue;
+		return bucket[bucket_id].value;
+	}
+
+	/* No tasks -- default clamp values */
+	return uclamp_none(clamp_id);
+}
+
+/*
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ */
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+				    unsigned int clamp_id)
+{
+	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+	struct uclamp_bucket *bucket;
+
+	lockdep_assert_held(&rq->lock);
+
+	bucket = &uc_rq->bucket[uc_se->bucket_id];
+	bucket->tasks++;
+
+	if (uc_se->value > READ_ONCE(uc_rq->value))
+		WRITE_ONCE(uc_rq->value, bucket->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+				    unsigned int clamp_id)
+{
+	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+	struct uclamp_bucket *bucket;
+	unsigned int rq_clamp;
+
+	lockdep_assert_held(&rq->lock);
+
+	bucket = &uc_rq->bucket[uc_se->bucket_id];
+	SCHED_WARN_ON(!bucket->tasks);
+	if (likely(bucket->tasks))
+		bucket->tasks--;
+
+	if (likely(bucket->tasks))
+		return;
+
+	rq_clamp = READ_ONCE(uc_rq->value);
+	/*
+	 * Defensive programming: this should never happen. If it happens,
+	 * e.g. due to future modification, warn and fixup the expected value.
+	 */
+	SCHED_WARN_ON(bucket->value > rq_clamp);
+	if (bucket->value >= rq_clamp)
+		WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id));
+}
+
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	if (unlikely(!p->sched_class->uclamp_enabled))
+		return;
+
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_inc_id(rq, p, clamp_id);
+}
+
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	if (unlikely(!p->sched_class->uclamp_enabled))
+		return;
+
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_dec_id(rq, p, clamp_id);
+}
+
+static void __init init_uclamp(void)
+{
+	unsigned int clamp_id;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct uclamp_bucket *bucket;
+		struct uclamp_rq *uc_rq;
+		unsigned int bucket_id;
+
+		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+
+		for_each_clamp_id(clamp_id) {
+			uc_rq = &cpu_rq(cpu)->uclamp[clamp_id];
+
+			bucket_id = 1;
+			while (bucket_id < UCLAMP_BUCKETS) {
+				bucket = &uc_rq->bucket[bucket_id];
+				bucket->value = bucket_id * UCLAMP_BUCKET_DELTA;
+				++bucket_id;
+			}
+		}
+	}
+
+	for_each_clamp_id(clamp_id) {
+		uclamp_se_set(&init_task.uclamp[clamp_id],
+			      uclamp_none(clamp_id));
+	}
+}
+
+#else /* CONFIG_UCLAMP_TASK */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* CONFIG_UCLAMP_TASK */
+
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (!(flags & ENQUEUE_NOCLOCK))
@@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_enqueue(p, flags & ENQUEUE_WAKEUP);
 	}
 
+	uclamp_rq_inc(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
@@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
 	}
 
+	uclamp_rq_dec(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -6093,6 +6257,8 @@ void __init sched_init(void)
 
 	psi_init();
 
+	init_uclamp();
+
 	scheduler_running = 1;
 }
 
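The enqueue/dequeue accounting added above can be exercised outside the kernel with a small simulation (a hypothetical sketch with simplified types: one clamp index, 5 buckets, no locking; it mirrors the inc/dec/max-scan logic rather than reusing the kernel code):

#include <stdio.h>

#define NBUCKETS	5
#define CAPACITY	1024
#define DELTA		((CAPACITY + NBUCKETS / 2) / NBUCKETS)

struct bucket { unsigned int value, tasks; };

static struct bucket buckets[NBUCKETS];
static unsigned int rq_clamp;		/* aggregated (max) clamp value */

/* Scan from the top: the highest bucket with tasks wins (max aggregation) */
static unsigned int max_value(void)
{
	int id;

	for (id = NBUCKETS - 1; id >= 0; id--)
		if (buckets[id].tasks)
			return buckets[id].value;
	return CAPACITY;		/* no tasks: default, as for util_max */
}

static void inc(unsigned int clamp_value)
{
	struct bucket *b = &buckets[clamp_value / DELTA];

	b->tasks++;
	if (clamp_value > rq_clamp)
		rq_clamp = b->value;
}

static void dec(unsigned int clamp_value)
{
	struct bucket *b = &buckets[clamp_value / DELTA];

	if (b->tasks)
		b->tasks--;
	if (!b->tasks && b->value >= rq_clamp)
		rq_clamp = max_value();	/* last task of the max bucket left */
}

int main(void)
{
	unsigned int id;

	/* Nominal bucket values, as set up by init_uclamp() in the hunk above */
	for (id = 0; id < NBUCKETS; id++)
		buckets[id].value = id * DELTA;

	inc(256);			/* 25% boosted task enqueued */
	inc(820);			/* 80% boosted task enqueued */
	printf("after enqueues: rq_clamp=%u\n", rq_clamp);
	dec(820);			/* the 80% task is dequeued */
	printf("after dequeue:  rq_clamp=%u\n", rq_clamp);
	return 0;
}

The second printf shows the re-scan case from the commit message: only when the last task of the bucket holding the current max is dequeued does the code walk the bucket array again.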
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58ab597ec88..cecc6baaba93 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -791,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
 #endif
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+	unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+	unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ *   utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ *   maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+	unsigned int value;
+	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -825,6 +867,11 @@ struct rq {
 	unsigned long		nr_load_updates;
 	u64			nr_switches;
 
+#ifdef CONFIG_UCLAMP_TASK
+	/* Utilization clamp values based on CPU's RUNNABLE tasks */
+	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
+#endif
+
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
@@ -1639,6 +1686,10 @@ extern const u32 sched_prio_to_wmult[40];
 struct sched_class {
 	const struct sched_class *next;
 
+#ifdef CONFIG_UCLAMP_TASK
+	int uclamp_enabled;
+#endif
+
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)   (struct rq *rq);
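The commit message's claim that the unordered bucket array fits in a single cache line for up to 7 buckets can be sanity-checked with a standalone sketch (assuming a 64-bit build, bits_per(SCHED_CAPACITY_SCALE) = 11 and a 64-byte cache line; illustrative only, not the kernel structures):

#include <stdio.h>

#define BITS_PER_LONG	64	/* assumed 64-bit build */
#define VALUE_BITS	11	/* bits_per(SCHED_CAPACITY_SCALE) for 1024 */

/* Mirrors struct uclamp_bucket: value and tasks share one unsigned long */
struct bucket_sketch {
	unsigned long value : VALUE_BITS;
	unsigned long tasks : BITS_PER_LONG - VALUE_BITS;
};

/* Mirrors struct uclamp_rq with 7 buckets, the largest count the commit
 * message says still fits in a single 64-byte cache line. */
struct uclamp_rq_sketch {
	unsigned int value;
	struct bucket_sketch bucket[7];
};

int main(void)
{
	printf("sizeof(bucket) = %zu, sizeof(rq clamp) = %zu bytes\n",
	       sizeof(struct bucket_sketch), sizeof(struct uclamp_rq_sketch));
	/* Expected on LP64: 8 and 64, i.e. exactly one typical cache line */
	return 0;
}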