author	Patrick Bellasi <patrick.bellasi@arm.com>	2019-06-21 04:42:02 -0400
committer	Ingo Molnar <mingo@kernel.org>	2019-06-24 13:23:44 -0400
commit	69842cba9ace84849bb9b8edcdf2cefccd97901c (patch)
tree	7d56d19500dc261558b8f4550119fc8950fac904
parent	a3df067974c52df936f548ed218120f623c4c560 (diff)
sched/uclamp: Add CPU's clamp buckets refcounting
Utilization clamping allows clamping the CPU's utilization within a
[util_min, util_max] range, depending on the set of RUNNABLE tasks on that
CPU. Each task references two "clamp buckets" defining its minimum and
maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is
active if there is at least one RUNNABLE task enqueued on that CPU and
refcounting that bucket.

When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets
on that CPU can change. Since each clamp bucket enforces a different
utilization clamp value, when the set of active clamp buckets changes for a
CPU, a new "aggregated" clamp value is computed for that CPU.

Clamp values are always MAX aggregated for both util_min and util_max. This
ensures that no task can affect the performance of other co-scheduled tasks
which are more boosted (i.e. with higher util_min clamp) or less capped
(i.e. with higher util_max clamp).

A task has:
   task_struct::uclamp[clamp_id]::bucket_id
to track the "bucket index" of the CPU's clamp bucket it refcounts while
enqueued, for each clamp index (clamp_id).

A runqueue has:
   rq::uclamp[clamp_id]::bucket[bucket_id].tasks
to track how many RUNNABLE tasks on that CPU refcount each clamp bucket
(bucket_id) of a clamp index (clamp_id).
It also has a:
   rq::uclamp[clamp_id]::bucket[bucket_id].value
to track the clamp value of each clamp bucket (bucket_id) of a clamp index
(clamp_id).

The rq::uclamp::bucket[clamp_id][] array is scanned every time it's needed
to find a new MAX aggregated clamp value for a clamp_id. This operation is
required only when the last task of a clamp bucket tracking the current MAX
aggregated clamp value is dequeued. In this case, the CPU is either entering
IDLE or going to schedule a less boosted or more clamped task.
The expected number of different clamp values configured at build time is
small enough to fit the full unordered array into a single cache line, for
configurations of up to 7 buckets.

Add to struct rq the basic data structures required to refcount the number
of RUNNABLE tasks for each clamp bucket. Also add the max aggregation
required to update the rq's clamp value at each enqueue/dequeue event.

Use a simple linear mapping of clamp values into clamp buckets. Pre-compute
and cache bucket_id to avoid integer divisions at enqueue/dequeue time.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alessio Balsini <balsini@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
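To illustrate the linear mapping described above, here is a minimal userspace sketch of the bucket arithmetic (hypothetical standalone code, assuming SCHED_CAPACITY_SCALE = 1024 and the default 5 buckets; it mirrors, but does not reuse, the kernel helpers added below):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed capacity scale */
#define UCLAMP_BUCKETS		5	/* assumed CONFIG_UCLAMP_BUCKETS_COUNT */

/* Integer rounded range covered by each bucket (DIV_ROUND_CLOSEST equivalent) */
#define UCLAMP_BUCKET_DELTA	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

/* Linear mapping of a clamp value to its bucket index, as in the patch */
static unsigned int bucket_of(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	/* A 25% boost (256/1024) and a 30% boost (307/1024) share bucket 1 */
	unsigned int boost25 = SCHED_CAPACITY_SCALE / 4;
	unsigned int boost30 = SCHED_CAPACITY_SCALE * 30 / 100;

	printf("delta=%u, 25%% -> bucket %u, 30%% -> bucket %u\n",
	       UCLAMP_BUCKET_DELTA, bucket_of(boost25), bucket_of(boost30));
	return 0;
}

With 5 buckets the delta is 205 capacity units, so both the 25% and the 30% boosted tasks land in bucket 1, which is exactly the sharing behaviour the Kconfig help text below describes.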
-rw-r--r--	include/linux/log2.h	34
-rw-r--r--	include/linux/sched.h	39
-rw-r--r--	include/linux/sched/topology.h	6
-rw-r--r--	init/Kconfig	53
-rw-r--r--	kernel/sched/core.c	166
-rw-r--r--	kernel/sched/sched.h	51
6 files changed, 343 insertions, 6 deletions
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 1aec01365ed4..83a4a3ca3e8a 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
 	ilog2((n) - 1) + 1) :		\
 	__order_base_2(n)		\
 )
+
+static inline __attribute__((const))
+int __bits_per(unsigned long n)
+{
+	if (n < 2)
+		return 1;
+	if (is_power_of_2(n))
+		return order_base_2(n) + 1;
+	return order_base_2(n);
+}
+
+/**
+ * bits_per - calculate the number of bits required for the argument
+ * @n: parameter
+ *
+ * This is constant-capable and can be used for compile time
+ * initializations, e.g bitfields.
+ *
+ * The first few values calculated by this routine:
+ *  bf(0) = 1
+ *  bf(1) = 1
+ *  bf(2) = 2
+ *  bf(3) = 2
+ *  bf(4) = 3
+ *  ... and so on.
+ */
+#define bits_per(n)				\
+(						\
+	__builtin_constant_p(n) ? (		\
+		((n) == 0 || (n) == 1)		\
+			? 1 : ilog2(n) + 1	\
+	) :					\
+	__bits_per(n)				\
+)
 #endif /* _LINUX_LOG2_H */
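The bits_per() helper above is used later in this patch to size bitfields. A rough userspace equivalent (an illustrative sketch, not the kernel macro, using __builtin_clzl() in place of ilog2()) reproduces the table from the kerneldoc:

#include <assert.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's bits_per(): number of bits needed
 * to represent n, with bits_per(0) == bits_per(1) == 1. */
static unsigned int bits_per(unsigned long n)
{
	if (n < 2)
		return 1;
	return (unsigned int)(sizeof(unsigned long) * 8 - __builtin_clzl(n));
}

int main(void)
{
	unsigned long n;

	/* Matches the kerneldoc table: bf(0)=1, bf(1)=1, bf(2)=2, bf(3)=2, bf(4)=3 */
	for (n = 0; n <= 4; n++)
		printf("bits_per(%lu) = %u\n", n, bits_per(n));

	assert(bits_per(1024) == 11);	/* SCHED_CAPACITY_SCALE fits in 11 bits */
	assert(bits_per(5) == 3);	/* a 5-bucket index fits in 3 bits */
	return 0;
}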
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 044c023875e8..80235bcd05f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -283,6 +283,18 @@ struct vtime {
 	u64			gtime;
 };
 
+/*
+ * Utilization clamp constraints.
+ * @UCLAMP_MIN:	Minimum utilization
+ * @UCLAMP_MAX:	Maximum utilization
+ * @UCLAMP_CNT:	Utilization clamp constraints count
+ */
+enum uclamp_id {
+	UCLAMP_MIN = 0,
+	UCLAMP_MAX,
+	UCLAMP_CNT
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -314,6 +326,10 @@ struct sched_info {
 # define SCHED_FIXEDPOINT_SHIFT		10
 # define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)
 
+/* Increase resolution of cpu_capacity calculations */
+# define SCHED_CAPACITY_SHIFT		SCHED_FIXEDPOINT_SHIFT
+# define SCHED_CAPACITY_SCALE		(1L << SCHED_CAPACITY_SHIFT)
+
 struct load_weight {
 	unsigned long			weight;
 	u32				inv_weight;
@@ -562,6 +578,25 @@ struct sched_dl_entity {
 	struct hrtimer			inactive_timer;
 };
 
+#ifdef CONFIG_UCLAMP_TASK
+/* Number of utilization clamp buckets (shorter alias) */
+#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
+
+/*
+ * Utilization clamp for a scheduling entity
+ * @value:		clamp value "assigned" to a se
+ * @bucket_id:		bucket index corresponding to the "assigned" value
+ *
+ * The bucket_id is the index of the clamp bucket matching the clamp value
+ * which is pre-computed and stored to avoid expensive integer divisions from
+ * the fast path.
+ */
+struct uclamp_se {
+	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
+	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 union rcu_special {
 	struct {
 		u8			blocked;
@@ -642,6 +677,10 @@ struct task_struct {
 #endif
 	struct sched_dl_entity		dl;
 
+#ifdef CONFIG_UCLAMP_TASK
+	struct uclamp_se		uclamp[UCLAMP_CNT];
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* List of struct preempt_notifier: */
 	struct hlist_head		preempt_notifiers;
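Given the bitfield widths in the struct uclamp_se hunk above, both per-task fields pack into a single word, and the bucket index is pre-computed rather than derived by division on the fast path. A small illustrative sketch (hypothetical names, assuming the default widths of 11 and 3 bits):

#include <stdio.h>

/* Illustrative stand-in for struct uclamp_se with the default widths:
 * 11 bits for the clamp value (0..1024) and 3 bits for the bucket index. */
struct uclamp_se_sketch {
	unsigned int value	: 11;
	unsigned int bucket_id	: 3;
};

int main(void)
{
	/* value 256 (25% of 1024) maps to bucket 1; both stored together */
	struct uclamp_se_sketch uc_se = { .value = 256, .bucket_id = 1 };

	printf("sizeof = %zu bytes, value=%u bucket_id=%u\n",
	       sizeof(uc_se), uc_se.value, uc_se.bucket_id);
	return 0;
}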
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index e445d3767cdd..7863bb62d2ab 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -7,12 +7,6 @@
 #include <linux/sched/idle.h>
 
 /*
- * Increase resolution of cpu_capacity calculations
- */
-#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
-#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
-
-/*
  * sched-domains (multiprocessor balancing) declarations:
  */
 #ifdef CONFIG_SMP
diff --git a/init/Kconfig b/init/Kconfig
index 0e2344389501..c88289c18d59 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
 config GENERIC_SCHED_CLOCK
 	bool
 
+menu "Scheduler features"
+
+config UCLAMP_TASK
+	bool "Enable utilization clamping for RT/FAIR tasks"
+	depends on CPU_FREQ_GOV_SCHEDUTIL
+	help
+	  This feature enables the scheduler to track the clamped utilization
+	  of each CPU based on RUNNABLE tasks scheduled on that CPU.
+
+	  With this option, the user can specify the min and max CPU
+	  utilization allowed for RUNNABLE tasks. The max utilization defines
+	  the maximum frequency a task should use while the min utilization
+	  defines the minimum frequency it should use.
+
+	  Both min and max utilization clamp values are hints to the scheduler,
+	  aiming at improving its frequency selection policy, but they do not
+	  enforce or grant any specific bandwidth for tasks.
+
+	  If in doubt, say N.
+
+config UCLAMP_BUCKETS_COUNT
+	int "Number of supported utilization clamp buckets"
+	range 5 20
+	default 5
+	depends on UCLAMP_TASK
+	help
+	  Defines the number of clamp buckets to use. The range of each bucket
+	  will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
+	  number of clamp buckets the finer their granularity and the higher
+	  the precision of clamping aggregation and tracking at run-time.
+
+	  For example, with the minimum configuration value we will have 5
+	  clamp buckets tracking 20% utilization each. A 25% boosted task will
+	  be refcounted in the [20..39]% bucket and will set the bucket clamp
+	  effective value to 25%.
+	  If a second 30% boosted task should be co-scheduled on the same CPU,
+	  that task will be refcounted in the same bucket as the first task and
+	  it will boost the bucket clamp effective value to 30%.
+	  The clamp effective value of a bucket is reset to its nominal value
+	  (20% in the example above) when there are no more tasks refcounted in
+	  that bucket.
+
+	  An additional boost/capping margin can be added to some tasks. In the
+	  example above the 25% task will be boosted to 30% until it exits the
+	  CPU. If that should be considered not acceptable on certain systems,
+	  it's always possible to reduce the margin by increasing the number of
+	  clamp buckets to trade off used memory for run-time tracking
+	  precision.
+
+	  If in doubt, use the default value.
+
+endmenu
+
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
 # balancing logic:
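The granularity trade-off described in the UCLAMP_BUCKETS_COUNT help text above can be seen by computing the bucket width for a few configurations (a standalone sketch under the assumption SCHED_CAPACITY_SCALE = 1024):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed capacity scale */

int main(void)
{
	/* The allowed Kconfig range is 5..20 buckets */
	unsigned int counts[] = { 5, 10, 20 };
	unsigned int i;

	for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		unsigned int delta = (SCHED_CAPACITY_SCALE + counts[i] / 2) / counts[i];
		unsigned int pct = (delta * 100 + SCHED_CAPACITY_SCALE / 2) / SCHED_CAPACITY_SCALE;

		/* With 5 buckets a 25% boosted task shares a ~20% wide bucket;
		 * with 20 buckets the worst-case rounding margin shrinks to ~5%. */
		printf("%2u buckets -> each covers %u units (~%u%% of capacity)\n",
		       counts[i], delta, pct);
	}
	return 0;
}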
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5e02d23e693..d8c1e67afd82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 	}
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+#define for_each_clamp_id(clamp_id) \
+	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+	return clamp_value / UCLAMP_BUCKET_DELTA;
+}
+
+static inline unsigned int uclamp_none(int clamp_id)
+{
+	if (clamp_id == UCLAMP_MIN)
+		return 0;
+	return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value)
+{
+	uc_se->value = value;
+	uc_se->bucket_id = uclamp_bucket_id(value);
+}
+
+static inline
+unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id)
+{
+	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+	int bucket_id = UCLAMP_BUCKETS - 1;
+
+	/*
+	 * Since both min and max clamps are max aggregated, find the
+	 * top most bucket with tasks in.
+	 */
+	for ( ; bucket_id >= 0; bucket_id--) {
+		if (!bucket[bucket_id].tasks)
+			continue;
+		return bucket[bucket_id].value;
+	}
+
+	/* No tasks -- default clamp values */
+	return uclamp_none(clamp_id);
+}
+
+/*
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ */
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+				    unsigned int clamp_id)
+{
+	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+	struct uclamp_bucket *bucket;
+
+	lockdep_assert_held(&rq->lock);
+
+	bucket = &uc_rq->bucket[uc_se->bucket_id];
+	bucket->tasks++;
+
+	if (uc_se->value > READ_ONCE(uc_rq->value))
+		WRITE_ONCE(uc_rq->value, bucket->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+				    unsigned int clamp_id)
+{
+	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+	struct uclamp_bucket *bucket;
+	unsigned int rq_clamp;
+
+	lockdep_assert_held(&rq->lock);
+
+	bucket = &uc_rq->bucket[uc_se->bucket_id];
+	SCHED_WARN_ON(!bucket->tasks);
+	if (likely(bucket->tasks))
+		bucket->tasks--;
+
+	if (likely(bucket->tasks))
+		return;
+
+	rq_clamp = READ_ONCE(uc_rq->value);
+	/*
+	 * Defensive programming: this should never happen. If it happens,
+	 * e.g. due to future modification, warn and fixup the expected value.
+	 */
+	SCHED_WARN_ON(bucket->value > rq_clamp);
+	if (bucket->value >= rq_clamp)
+		WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id));
+}
+
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	if (unlikely(!p->sched_class->uclamp_enabled))
+		return;
+
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_inc_id(rq, p, clamp_id);
+}
+
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+{
+	unsigned int clamp_id;
+
+	if (unlikely(!p->sched_class->uclamp_enabled))
+		return;
+
+	for_each_clamp_id(clamp_id)
+		uclamp_rq_dec_id(rq, p, clamp_id);
+}
+
+static void __init init_uclamp(void)
+{
+	unsigned int clamp_id;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct uclamp_bucket *bucket;
+		struct uclamp_rq *uc_rq;
+		unsigned int bucket_id;
+
+		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+
+		for_each_clamp_id(clamp_id) {
+			uc_rq = &cpu_rq(cpu)->uclamp[clamp_id];
+
+			bucket_id = 1;
+			while (bucket_id < UCLAMP_BUCKETS) {
+				bucket = &uc_rq->bucket[bucket_id];
+				bucket->value = bucket_id * UCLAMP_BUCKET_DELTA;
+				++bucket_id;
+			}
+		}
+	}
+
+	for_each_clamp_id(clamp_id) {
+		uclamp_se_set(&init_task.uclamp[clamp_id],
+			      uclamp_none(clamp_id));
+	}
+}
+
+#else /* CONFIG_UCLAMP_TASK */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* CONFIG_UCLAMP_TASK */
+
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (!(flags & ENQUEUE_NOCLOCK))
@@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_enqueue(p, flags & ENQUEUE_WAKEUP);
 	}
 
+	uclamp_rq_inc(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
@@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
 	}
 
+	uclamp_rq_dec(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -6093,6 +6257,8 @@ void __init sched_init(void)
 
 	psi_init();
 
+	init_uclamp();
+
 	scheduler_running = 1;
 }
 
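The enqueue/dequeue accounting added above can be exercised outside the kernel with a small simulation (a hypothetical sketch with simplified types: one clamp index, 5 buckets, no locking; it mirrors the inc/dec/max-scan logic rather than reusing the kernel code):

#include <stdio.h>

#define NBUCKETS	5
#define CAPACITY	1024
#define DELTA		((CAPACITY + NBUCKETS / 2) / NBUCKETS)

struct bucket { unsigned int value, tasks; };

static struct bucket buckets[NBUCKETS];
static unsigned int rq_clamp;		/* aggregated (max) clamp value */

/* Scan from the top: the highest bucket with tasks wins (max aggregation) */
static unsigned int max_value(void)
{
	int id;

	for (id = NBUCKETS - 1; id >= 0; id--)
		if (buckets[id].tasks)
			return buckets[id].value;
	return CAPACITY;		/* no tasks: default, as for util_max */
}

static void inc(unsigned int clamp_value)
{
	struct bucket *b = &buckets[clamp_value / DELTA];

	b->tasks++;
	if (clamp_value > rq_clamp)
		rq_clamp = b->value;
}

static void dec(unsigned int clamp_value)
{
	struct bucket *b = &buckets[clamp_value / DELTA];

	if (b->tasks)
		b->tasks--;
	if (!b->tasks && b->value >= rq_clamp)
		rq_clamp = max_value();	/* last task of the max bucket left */
}

int main(void)
{
	unsigned int id;

	/* Nominal bucket values, as set up by init_uclamp() in the hunk above */
	for (id = 0; id < NBUCKETS; id++)
		buckets[id].value = id * DELTA;

	inc(256);			/* 25% boosted task enqueued */
	inc(820);			/* 80% boosted task enqueued */
	printf("after enqueues: rq_clamp=%u\n", rq_clamp);
	dec(820);			/* the 80% task is dequeued */
	printf("after dequeue:  rq_clamp=%u\n", rq_clamp);
	return 0;
}

The second printf shows the re-scan case from the commit message: only when the last task of the bucket holding the current max is dequeued does the code walk the bucket array again.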
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58ab597ec88..cecc6baaba93 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -791,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
 #endif
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+	unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+	unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ *   utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ *   maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+	unsigned int value;
+	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -825,6 +867,11 @@ struct rq {
 	unsigned long		nr_load_updates;
 	u64			nr_switches;
 
+#ifdef CONFIG_UCLAMP_TASK
+	/* Utilization clamp values based on CPU's RUNNABLE tasks */
+	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
+#endif
+
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
@@ -1639,6 +1686,10 @@ extern const u32 sched_prio_to_wmult[40];
 struct sched_class {
 	const struct sched_class *next;
 
+#ifdef CONFIG_UCLAMP_TASK
+	int uclamp_enabled;
+#endif
+
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)   (struct rq *rq);
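The commit message's claim that the unordered bucket array fits in a single cache line for up to 7 buckets can be sanity-checked with a standalone sketch (assuming a 64-bit build, bits_per(SCHED_CAPACITY_SCALE) = 11 and a 64-byte cache line; illustrative only, not the kernel structures):

#include <stdio.h>

#define BITS_PER_LONG	64	/* assumed 64-bit build */
#define VALUE_BITS	11	/* bits_per(SCHED_CAPACITY_SCALE) for 1024 */

/* Mirrors struct uclamp_bucket: value and tasks share one unsigned long */
struct bucket_sketch {
	unsigned long value : VALUE_BITS;
	unsigned long tasks : BITS_PER_LONG - VALUE_BITS;
};

/* Mirrors struct uclamp_rq with 7 buckets, the largest count the commit
 * message says still fits in a single 64-byte cache line. */
struct uclamp_rq_sketch {
	unsigned int value;
	struct bucket_sketch bucket[7];
};

int main(void)
{
	printf("sizeof(bucket) = %zu, sizeof(rq clamp) = %zu bytes\n",
	       sizeof(struct bucket_sketch), sizeof(struct uclamp_rq_sketch));
	/* Expected on LP64: 8 and 64, i.e. exactly one typical cache line */
	return 0;
}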