aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPatrick Bellasi <patrick.bellasi@arm.com>2019-06-21 04:42:07 -0400
committerIngo Molnar <mingo@kernel.org>2019-06-24 13:23:46 -0400
commita509a7cd79747074a2c018a45bbbc52d1f4aed44 (patch)
tree278bf6dc22f256956b614c5b5960f25010afc93a
parent1d6362fa0cfc8c7b243fa92924429d826599e691 (diff)
sched/uclamp: Extend sched_setattr() to support utilization clamping
The SCHED_DEADLINE scheduling class provides an advanced and formal model to define task requirements that can translate into proper decisions for both task placement and frequency selection. Other classes have a more simplified model based on the POSIX concept of priorities. Such a simple priority-based model, however, does not allow exploiting some of the most advanced features of the Linux scheduler, such as driving frequency selection via the schedutil cpufreq governor. However, even for non-SCHED_DEADLINE tasks, it is still interesting to define task properties that support scheduler decisions. Utilization clamping exposes to user-space a new set of per-task attributes the scheduler can use as hints about the expected/required utilization for a task. This makes it possible to implement a "proactive" per-task frequency control policy, a more advanced policy than the current one based just on "passive" measured task utilization. For example, it is possible to boost interactive tasks (e.g. to get better performance) or cap background tasks (e.g. to be more energy/thermal efficient). Introduce a new API to set utilization clamping values for a specified task by extending sched_setattr(), a syscall which already allows defining task-specific properties for different scheduling classes. A new pair of attributes allows specifying a minimum and a maximum utilization the scheduler can consider for a task. Do that by validating the required clamp values first, and then applying the required changes using the same pattern already in use for __setscheduler(). This ensures that the task is re-enqueued with the new clamp values.
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Alessio Balsini <balsini@android.com> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com> Cc: Joel Fernandes <joelaf@google.com> Cc: Juri Lelli <juri.lelli@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Morten Rasmussen <morten.rasmussen@arm.com> Cc: Paul Turner <pjt@google.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Perret <quentin.perret@arm.com> Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com> Cc: Steve Muckle <smuckle@google.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Todd Kjos <tkjos@google.com> Cc: Vincent Guittot <vincent.guittot@linaro.org> Cc: Viresh Kumar <viresh.kumar@linaro.org> Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--include/linux/sched.h9
-rw-r--r--include/uapi/linux/sched.h12
-rw-r--r--include/uapi/linux/sched/types.h66
-rw-r--r--kernel/sched/core.c91
4 files changed, 161 insertions, 17 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5485f411e8e1..1113dd4706ae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -587,6 +587,7 @@ struct sched_dl_entity {
587 * @value: clamp value "assigned" to a se 587 * @value: clamp value "assigned" to a se
588 * @bucket_id: bucket index corresponding to the "assigned" value 588 * @bucket_id: bucket index corresponding to the "assigned" value
589 * @active: the se is currently refcounted in a rq's bucket 589 * @active: the se is currently refcounted in a rq's bucket
590 * @user_defined: the requested clamp value comes from user-space
590 * 591 *
591 * The bucket_id is the index of the clamp bucket matching the clamp value 592 * The bucket_id is the index of the clamp bucket matching the clamp value
592 * which is pre-computed and stored to avoid expensive integer divisions from 593 * which is pre-computed and stored to avoid expensive integer divisions from
@@ -596,11 +597,19 @@ struct sched_dl_entity {
596 * which can be different from the clamp value "requested" from user-space. 597 * which can be different from the clamp value "requested" from user-space.
597 * This allows to know a task is refcounted in the rq's bucket corresponding 598 * This allows to know a task is refcounted in the rq's bucket corresponding
598 * to the "effective" bucket_id. 599 * to the "effective" bucket_id.
600 *
601 * The user_defined bit is set whenever a task has got a task-specific clamp
602 * value requested from userspace, i.e. the system defaults apply to this task
603 * just as a restriction. This allows to relax default clamps when a less
604 * restrictive task-specific value has been requested, thus allowing to
605 * implement a "nice" semantic. For example, a task running with a 20%
606 * default boost can still drop its own boosting to 0%.
599 */ 607 */
600struct uclamp_se { 608struct uclamp_se {
601 unsigned int value : bits_per(SCHED_CAPACITY_SCALE); 609 unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
602 unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); 610 unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
603 unsigned int active : 1; 611 unsigned int active : 1;
612 unsigned int user_defined : 1;
604}; 613};
605#endif /* CONFIG_UCLAMP_TASK */ 614#endif /* CONFIG_UCLAMP_TASK */
606 615
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 58b2368d3634..617bb59aa8ba 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -52,10 +52,20 @@
52#define SCHED_FLAG_RECLAIM 0x02 52#define SCHED_FLAG_RECLAIM 0x02
53#define SCHED_FLAG_DL_OVERRUN 0x04 53#define SCHED_FLAG_DL_OVERRUN 0x04
54#define SCHED_FLAG_KEEP_POLICY 0x08 54#define SCHED_FLAG_KEEP_POLICY 0x08
55#define SCHED_FLAG_KEEP_PARAMS 0x10
56#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
57#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
58
59#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
60 SCHED_FLAG_KEEP_PARAMS)
61
62#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \
63 SCHED_FLAG_UTIL_CLAMP_MAX)
55 64
56#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ 65#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \
57 SCHED_FLAG_RECLAIM | \ 66 SCHED_FLAG_RECLAIM | \
58 SCHED_FLAG_DL_OVERRUN | \ 67 SCHED_FLAG_DL_OVERRUN | \
59 SCHED_FLAG_KEEP_POLICY) 68 SCHED_FLAG_KEEP_ALL | \
69 SCHED_FLAG_UTIL_CLAMP)
60 70
61#endif /* _UAPI_LINUX_SCHED_H */ 71#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 10fbb8031930..c852153ddb0d 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -9,6 +9,7 @@ struct sched_param {
9}; 9};
10 10
11#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 11#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
12#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
12 13
13/* 14/*
14 * Extended scheduling parameters data structure. 15 * Extended scheduling parameters data structure.
@@ -21,8 +22,33 @@ struct sched_param {
21 * the tasks may be useful for a wide variety of application fields, e.g., 22 * the tasks may be useful for a wide variety of application fields, e.g.,
22 * multimedia, streaming, automation and control, and many others. 23 * multimedia, streaming, automation and control, and many others.
23 * 24 *
24 * This variant (sched_attr) is meant at describing a so-called 25 * This variant (sched_attr) allows to define additional attributes to
25 * sporadic time-constrained task. In such model a task is specified by: 26 * improve the scheduler knowledge about task requirements.
27 *
28 * Scheduling Class Attributes
29 * ===========================
30 *
31 * A subset of sched_attr attributes specifies the
32 * scheduling policy and relative POSIX attributes:
33 *
34 * @size size of the structure, for fwd/bwd compat.
35 *
36 * @sched_policy task's scheduling policy
37 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
38 * @sched_priority task's static priority (SCHED_FIFO/RR)
39 *
40 * Certain more advanced scheduling features can be controlled by a
41 * predefined set of flags via the attribute:
42 *
43 * @sched_flags for customizing the scheduler behaviour
44 *
45 * Sporadic Time-Constrained Task Attributes
46 * =========================================
47 *
48 * A subset of sched_attr attributes allows to describe a so-called
49 * sporadic time-constrained task.
50 *
51 * In such a model a task is specified by:
26 * - the activation period or minimum instance inter-arrival time; 52 * - the activation period or minimum instance inter-arrival time;
27 * - the maximum (or average, depending on the actual scheduling 53 * - the maximum (or average, depending on the actual scheduling
28 * discipline) computation time of all instances, a.k.a. runtime; 54 * discipline) computation time of all instances, a.k.a. runtime;
@@ -34,14 +60,8 @@ struct sched_param {
34 * than the runtime and must be completed by time instant t equal to 60 * than the runtime and must be completed by time instant t equal to
35 * the instance activation time + the deadline. 61 * the instance activation time + the deadline.
36 * 62 *
37 * This is reflected by the actual fields of the sched_attr structure: 63 * This is reflected by the following fields of the sched_attr structure:
38 * 64 *
39 * @size size of the structure, for fwd/bwd compat.
40 *
41 * @sched_policy task's scheduling policy
42 * @sched_flags for customizing the scheduler behaviour
43 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
44 * @sched_priority task's static priority (SCHED_FIFO/RR)
45 * @sched_deadline representative of the task's deadline 65 * @sched_deadline representative of the task's deadline
46 * @sched_runtime representative of the task's runtime 66 * @sched_runtime representative of the task's runtime
47 * @sched_period representative of the task's period 67 * @sched_period representative of the task's period
@@ -53,6 +73,29 @@ struct sched_param {
53 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the 73 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
54 * only user of this new interface. More information about the algorithm 74 * only user of this new interface. More information about the algorithm
55 * available in the scheduling class file or in Documentation/. 75 * available in the scheduling class file or in Documentation/.
76 *
77 * Task Utilization Attributes
78 * ===========================
79 *
80 * A subset of sched_attr attributes allows to specify the utilization
81 * expected for a task. These attributes allow to inform the scheduler about
82 * the utilization boundaries within which it should schedule the task. These
83 * boundaries are valuable hints to support scheduler decisions on both task
84 * placement and frequency selection.
85 *
86 * @sched_util_min represents the minimum utilization
87 * @sched_util_max represents the maximum utilization
88 *
89 * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It
90 * represents the percentage of CPU time used by a task when running at the
91 * maximum frequency on the highest capacity CPU of the system. For example, a
92 * 20% utilization task is a task running for 2ms every 10ms at maximum
93 * frequency.
94 *
95 * A task with a min utilization value bigger than 0 is more likely scheduled
96 * on a CPU with a capacity big enough to fit the specified value.
97 * A task with a max utilization value smaller than 1024 is more likely
98 * scheduled on a CPU with no more capacity than the specified value.
56 */ 99 */
57struct sched_attr { 100struct sched_attr {
58 __u32 size; 101 __u32 size;
@@ -70,6 +113,11 @@ struct sched_attr {
70 __u64 sched_runtime; 113 __u64 sched_runtime;
71 __u64 sched_deadline; 114 __u64 sched_deadline;
72 __u64 sched_period; 115 __u64 sched_period;
116
117 /* Utilization hints */
118 __u32 sched_util_min;
119 __u32 sched_util_max;
120
73}; 121};
74 122
75#endif /* _UAPI_LINUX_SCHED_TYPES_H */ 123#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d519f3f9789..e9a669266fa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -805,10 +805,12 @@ static inline unsigned int uclamp_none(int clamp_id)
805 return SCHED_CAPACITY_SCALE; 805 return SCHED_CAPACITY_SCALE;
806} 806}
807 807
808static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) 808static inline void uclamp_se_set(struct uclamp_se *uc_se,
809 unsigned int value, bool user_defined)
809{ 810{
810 uc_se->value = value; 811 uc_se->value = value;
811 uc_se->bucket_id = uclamp_bucket_id(value); 812 uc_se->bucket_id = uclamp_bucket_id(value);
813 uc_se->user_defined = user_defined;
812} 814}
813 815
814static inline unsigned int 816static inline unsigned int
@@ -1016,11 +1018,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1016 1018
1017 if (old_min != sysctl_sched_uclamp_util_min) { 1019 if (old_min != sysctl_sched_uclamp_util_min) {
1018 uclamp_se_set(&uclamp_default[UCLAMP_MIN], 1020 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1019 sysctl_sched_uclamp_util_min); 1021 sysctl_sched_uclamp_util_min, false);
1020 } 1022 }
1021 if (old_max != sysctl_sched_uclamp_util_max) { 1023 if (old_max != sysctl_sched_uclamp_util_max) {
1022 uclamp_se_set(&uclamp_default[UCLAMP_MAX], 1024 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1023 sysctl_sched_uclamp_util_max); 1025 sysctl_sched_uclamp_util_max, false);
1024 } 1026 }
1025 1027
1026 /* 1028 /*
@@ -1038,6 +1040,42 @@ done:
1038 return result; 1040 return result;
1039} 1041}
1040 1042
1043static int uclamp_validate(struct task_struct *p,
1044 const struct sched_attr *attr)
1045{
1046 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1047 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1048
1049 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1050 lower_bound = attr->sched_util_min;
1051 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1052 upper_bound = attr->sched_util_max;
1053
1054 if (lower_bound > upper_bound)
1055 return -EINVAL;
1056 if (upper_bound > SCHED_CAPACITY_SCALE)
1057 return -EINVAL;
1058
1059 return 0;
1060}
1061
1062static void __setscheduler_uclamp(struct task_struct *p,
1063 const struct sched_attr *attr)
1064{
1065 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1066 return;
1067
1068 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1069 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1070 attr->sched_util_min, true);
1071 }
1072
1073 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1074 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1075 attr->sched_util_max, true);
1076 }
1077}
1078
1041static void uclamp_fork(struct task_struct *p) 1079static void uclamp_fork(struct task_struct *p)
1042{ 1080{
1043 unsigned int clamp_id; 1081 unsigned int clamp_id;
@@ -1059,11 +1097,11 @@ static void __init init_uclamp(void)
1059 1097
1060 for_each_clamp_id(clamp_id) { 1098 for_each_clamp_id(clamp_id) {
1061 uclamp_se_set(&init_task.uclamp_req[clamp_id], 1099 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1062 uclamp_none(clamp_id)); 1100 uclamp_none(clamp_id), false);
1063 } 1101 }
1064 1102
1065 /* System defaults allow max clamp values for both indexes */ 1103 /* System defaults allow max clamp values for both indexes */
1066 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); 1104 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1067 for_each_clamp_id(clamp_id) 1105 for_each_clamp_id(clamp_id)
1068 uclamp_default[clamp_id] = uc_max; 1106 uclamp_default[clamp_id] = uc_max;
1069} 1107}
@@ -1071,6 +1109,13 @@ static void __init init_uclamp(void)
1071#else /* CONFIG_UCLAMP_TASK */ 1109#else /* CONFIG_UCLAMP_TASK */
1072static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } 1110static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1073static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } 1111static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1112static inline int uclamp_validate(struct task_struct *p,
1113 const struct sched_attr *attr)
1114{
1115 return -EOPNOTSUPP;
1116}
1117static void __setscheduler_uclamp(struct task_struct *p,
1118 const struct sched_attr *attr) { }
1074static inline void uclamp_fork(struct task_struct *p) { } 1119static inline void uclamp_fork(struct task_struct *p) { }
1075static inline void init_uclamp(void) { } 1120static inline void init_uclamp(void) { }
1076#endif /* CONFIG_UCLAMP_TASK */ 1121#endif /* CONFIG_UCLAMP_TASK */
@@ -4412,6 +4457,13 @@ static void __setscheduler_params(struct task_struct *p,
4412static void __setscheduler(struct rq *rq, struct task_struct *p, 4457static void __setscheduler(struct rq *rq, struct task_struct *p,
4413 const struct sched_attr *attr, bool keep_boost) 4458 const struct sched_attr *attr, bool keep_boost)
4414{ 4459{
4460 /*
4461 * If params can't change scheduling class changes aren't allowed
4462 * either.
4463 */
4464 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4465 return;
4466
4415 __setscheduler_params(p, attr); 4467 __setscheduler_params(p, attr);
4416 4468
4417 /* 4469 /*
@@ -4549,6 +4601,13 @@ recheck:
4549 return retval; 4601 return retval;
4550 } 4602 }
4551 4603
4604 /* Update task specific "requested" clamps */
4605 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
4606 retval = uclamp_validate(p, attr);
4607 if (retval)
4608 return retval;
4609 }
4610
4552 /* 4611 /*
4553 * Make sure no PI-waiters arrive (or leave) while we are 4612 * Make sure no PI-waiters arrive (or leave) while we are
4554 * changing the priority of the task: 4613 * changing the priority of the task:
@@ -4578,6 +4637,8 @@ recheck:
4578 goto change; 4637 goto change;
4579 if (dl_policy(policy) && dl_param_changed(p, attr)) 4638 if (dl_policy(policy) && dl_param_changed(p, attr))
4580 goto change; 4639 goto change;
4640 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
4641 goto change;
4581 4642
4582 p->sched_reset_on_fork = reset_on_fork; 4643 p->sched_reset_on_fork = reset_on_fork;
4583 task_rq_unlock(rq, p, &rf); 4644 task_rq_unlock(rq, p, &rf);
@@ -4658,7 +4719,9 @@ change:
4658 put_prev_task(rq, p); 4719 put_prev_task(rq, p);
4659 4720
4660 prev_class = p->sched_class; 4721 prev_class = p->sched_class;
4722
4661 __setscheduler(rq, p, attr, pi); 4723 __setscheduler(rq, p, attr, pi);
4724 __setscheduler_uclamp(p, attr);
4662 4725
4663 if (queued) { 4726 if (queued) {
4664 /* 4727 /*
@@ -4834,6 +4897,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
4834 if (ret) 4897 if (ret)
4835 return -EFAULT; 4898 return -EFAULT;
4836 4899
4900 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
4901 size < SCHED_ATTR_SIZE_VER1)
4902 return -EINVAL;
4903
4837 /* 4904 /*
4838 * XXX: Do we want to be lenient like existing syscalls; or do we want 4905 * XXX: Do we want to be lenient like existing syscalls; or do we want
4839 * to be strict and return an error on out-of-bounds values? 4906 * to be strict and return an error on out-of-bounds values?
@@ -4903,10 +4970,15 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4903 rcu_read_lock(); 4970 rcu_read_lock();
4904 retval = -ESRCH; 4971 retval = -ESRCH;
4905 p = find_process_by_pid(pid); 4972 p = find_process_by_pid(pid);
4906 if (p != NULL) 4973 if (likely(p))
4907 retval = sched_setattr(p, &attr); 4974 get_task_struct(p);
4908 rcu_read_unlock(); 4975 rcu_read_unlock();
4909 4976
4977 if (likely(p)) {
4978 retval = sched_setattr(p, &attr);
4979 put_task_struct(p);
4980 }
4981
4910 return retval; 4982 return retval;
4911} 4983}
4912 4984
@@ -5057,6 +5129,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5057 else 5129 else
5058 attr.sched_nice = task_nice(p); 5130 attr.sched_nice = task_nice(p);
5059 5131
5132#ifdef CONFIG_UCLAMP_TASK
5133 attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5134 attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5135#endif
5136
5060 rcu_read_unlock(); 5137 rcu_read_unlock();
5061 5138
5062 retval = sched_read_attr(uattr, &attr, size); 5139 retval = sched_read_attr(uattr, &attr, size);