path: root/kernel/sched/core.c
author	Patrick Bellasi <patrick.bellasi@arm.com>	2019-08-22 09:28:06 -0400
committer	Ingo Molnar <mingo@kernel.org>	2019-09-03 03:17:37 -0400
commit	2480c093130f64ac3a410504fa8b3db1fc4b87ce (patch)
tree	1dbeca7920dfcadd75249ab41de421bbb2a3ad85 /kernel/sched/core.c
parent	a55c7454a8c887b226a01d7eed088ccb5374d81e (diff)
sched/uclamp: Extend CPU's cgroup controller
The cgroup CPU bandwidth controller allows assigning a specified (maximum) bandwidth to the tasks of a group. However, this bandwidth is defined and enforced only on a temporal basis, without considering the actual frequency a CPU is running at. Thus, the amount of computation completed by a task within an allocated bandwidth can be very different depending on the actual frequency at which the CPU runs that task. The amount of computation can also be affected by the specific CPU a task is running on, especially on asymmetric-capacity systems such as Arm's big.LITTLE.

With the availability of schedutil, the scheduler is now able to drive frequency selections based on actual task utilization. Moreover, the utilization clamping support provides a mechanism to bias the frequency selection operated by schedutil depending on constraints assigned to the tasks currently RUNNABLE on a CPU.

Given the mechanisms described above, it is now possible to extend the cpu controller to specify the minimum (or maximum) utilization which should be considered for tasks RUNNABLE on a CPU. This makes it possible to better define the actual computational power assigned to task groups, thus improving the cgroup CPU bandwidth controller, which is currently based only on time constraints.

Extend the CPU controller with a couple of new attributes, uclamp.{min,max}, which allow enforcing utilization boosting and capping for all the tasks in a group.

Specifically:

- uclamp.min: defines the minimum utilization which should be considered,
  i.e. the RUNNABLE tasks of this group will run at least at a minimum
  frequency which corresponds to the uclamp.min utilization

- uclamp.max: defines the maximum utilization which should be considered,
  i.e. the RUNNABLE tasks of this group will run up to a maximum
  frequency which corresponds to the uclamp.max utilization

These attributes:

a) are available only for non-root nodes, both on default and legacy
   hierarchies, while system-wide clamps are defined by a generic
   interface which does not depend on cgroups. This system-wide interface
   enforces constraints on tasks in the root node.

b) enforce effective constraints at each level of the hierarchy, which
   are a restriction of the group requests considering its parent's
   effective constraints. Root group effective constraints are defined by
   the system-wide interface.
   This mechanism allows each (non-root) level of the hierarchy to:
   - request whatever clamp values it would like to get
   - effectively get only up to the maximum amount allowed by its parent

c) have higher priority than task-specific clamps, defined via
   sched_setattr(), thus making it possible to control and restrict task
   requests.

Add two new attributes to the cpu controller to collect "requested" clamp values. Allow that at each non-root level of the hierarchy. Keep it simple by not caring, for now, about the computation and propagation of "effective" values along the hierarchy.

Update sysctl_sched_uclamp_handler() to use the newly introduced uclamp_mutex, so that system default updates are serialized with cgroup-related updates.
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Michal Koutny <mkoutny@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Alessio Balsini <balsini@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lkml.kernel.org/r/20190822132811.31294-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
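As a usage illustration (not part of this patch): under the cpu controller the new attributes appear as cpu.uclamp.min and cpu.uclamp.max. The following minimal userspace sketch assumes a cgroup v2 hierarchy mounted at /sys/fs/cgroup and an already created child group named "demo"; the mount point and group name are illustrative assumptions, not something this patch defines. Values are percentages of CPU capacity, and "max" means no clamp.

/*
 * Sketch only: boost the tasks of group "demo" to at least 20% of CPU
 * capacity and cap them at 80%, using the attributes added by this patch.
 * The cgroup mount point and group name below are assumptions.
 */
#include <stdio.h>

static int write_attr(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	write_attr("/sys/fs/cgroup/demo/cpu.uclamp.min", "20");
	write_attr("/sys/fs/cgroup/demo/cpu.uclamp.max", "80");
	return 0;
}

Reading the same files back returns either "max" or the requested percentage with two decimal digits, as implemented by cpu_uclamp_print() in the diff below.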
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	193
1 file changed, 189 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6661852907b..c186abed5c6d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
 /* Max allowed minimum utilization */
 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 
@@ -1010,10 +1022,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				loff_t *ppos)
 {
 	int old_min, old_max;
-	static DEFINE_MUTEX(mutex);
 	int result;
 
-	mutex_lock(&mutex);
+	mutex_lock(&uclamp_mutex);
 	old_min = sysctl_sched_uclamp_util_min;
 	old_max = sysctl_sched_uclamp_util_max;
 
@@ -1048,7 +1059,7 @@ undo:
 	sysctl_sched_uclamp_util_min = old_min;
 	sysctl_sched_uclamp_util_max = old_max;
 done:
-	mutex_unlock(&mutex);
+	mutex_unlock(&uclamp_mutex);
 
 	return result;
 }
@@ -1137,6 +1148,8 @@ static void __init init_uclamp(void)
 	unsigned int clamp_id;
 	int cpu;
 
+	mutex_init(&uclamp_mutex);
+
 	for_each_possible_cpu(cpu) {
 		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
 		cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1162,12 @@ static void __init init_uclamp(void)
 
 	/* System defaults allow max clamp values for both indexes */
 	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-	for_each_clamp_id(clamp_id)
+	for_each_clamp_id(clamp_id) {
 		uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+		root_task_group.uclamp_req[clamp_id] = uc_max;
+#endif
+	}
 }
 
 #else /* CONFIG_UCLAMP_TASK */
@@ -6798,6 +6815,19 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+					    struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	int clamp_id;
+
+	for_each_clamp_id(clamp_id) {
+		uclamp_se_set(&tg->uclamp_req[clamp_id],
+			      uclamp_none(clamp_id), false);
+	}
+#endif
+}
+
 static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
@@ -6821,6 +6851,8 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	alloc_uclamp_sched_group(tg, parent);
+
 	return tg;
 
 err:
@@ -7037,6 +7069,131 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 	sched_move_task(task);
 }
 
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+	s64 percent;
+	u64 util;
+	int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+	struct uclamp_request req = {
+		.percent = UCLAMP_PERCENT_SCALE,
+		.util = SCHED_CAPACITY_SCALE,
+		.ret = 0,
+	};
+
+	buf = strim(buf);
+	if (strcmp(buf, "max")) {
+		req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+					     &req.percent);
+		if (req.ret)
+			return req;
+		if (req.percent > UCLAMP_PERCENT_SCALE) {
+			req.ret = -ERANGE;
+			return req;
+		}
+
+		req.util = req.percent << SCHED_CAPACITY_SHIFT;
+		req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+	}
+
+	return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+				size_t nbytes, loff_t off,
+				enum uclamp_id clamp_id)
+{
+	struct uclamp_request req;
+	struct task_group *tg;
+
+	req = capacity_from_percent(buf);
+	if (req.ret)
+		return req.ret;
+
+	mutex_lock(&uclamp_mutex);
+	rcu_read_lock();
+
+	tg = css_tg(of_css(of));
+	if (tg->uclamp_req[clamp_id].value != req.util)
+		uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+	/*
+	 * Because of not recoverable conversion rounding we keep track of the
+	 * exact requested value
+	 */
+	tg->uclamp_pct[clamp_id] = req.percent;
+
+	rcu_read_unlock();
+	mutex_unlock(&uclamp_mutex);
+
+	return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes,
+				    loff_t off)
+{
+	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes,
+				    loff_t off)
+{
+	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+				    enum uclamp_id clamp_id)
+{
+	struct task_group *tg;
+	u64 util_clamp;
+	u64 percent;
+	u32 rem;
+
+	rcu_read_lock();
+	tg = css_tg(seq_css(sf));
+	util_clamp = tg->uclamp_req[clamp_id].value;
+	rcu_read_unlock();
+
+	if (util_clamp == SCHED_CAPACITY_SCALE) {
+		seq_puts(sf, "max\n");
+		return;
+	}
+
+	percent = tg->uclamp_pct[clamp_id];
+	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+	cpu_uclamp_print(sf, UCLAMP_MIN);
+	return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+	cpu_uclamp_print(sf, UCLAMP_MAX);
+	return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
@@ -7382,6 +7539,20 @@ static struct cftype cpu_legacy_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	{
+		.name = "uclamp.min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_min_show,
+		.write = cpu_uclamp_min_write,
+	},
+	{
+		.name = "uclamp.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_max_show,
+		.write = cpu_uclamp_max_write,
+	},
+#endif
 	{ }	/* Terminate */
 };
 
@@ -7549,6 +7720,20 @@ static struct cftype cpu_files[] = {
 		.write = cpu_max_write,
 	},
 #endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	{
+		.name = "uclamp.min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_min_show,
+		.write = cpu_uclamp_min_write,
+	},
+	{
+		.name = "uclamp.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_max_show,
+		.write = cpu_uclamp_max_write,
+	},
+#endif
 	{ }	/* terminate */
 };
 
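For reference, a minimal standalone sketch of the percentage-to-utilization conversion performed by capacity_from_percent() above. This is a userspace re-implementation for illustration only, not kernel code; it reuses the kernel's SCHED_CAPACITY_SHIFT of 10 (hence SCHED_CAPACITY_SCALE = 1024) and the UCLAMP_PERCENT_SCALE of 10000 defined by this patch.

/*
 * Illustration only: mirror the arithmetic of capacity_from_percent().
 * A percentage expressed with two decimal digits (e.g. "20.50" -> 2050)
 * is converted into a utilization clamp in the [0..1024] capacity range,
 * rounding to the nearest value like DIV_ROUND_CLOSEST_ULL() does.
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10		/* SCHED_CAPACITY_SCALE == 1024 */
#define UCLAMP_PERCENT_SCALE	10000ULL	/* 100 * 10^UCLAMP_PERCENT_SHIFT */

static uint64_t percent_to_util(uint64_t percent_x100)
{
	uint64_t util = percent_x100 << SCHED_CAPACITY_SHIFT;

	return (util + UCLAMP_PERCENT_SCALE / 2) / UCLAMP_PERCENT_SCALE;
}

int main(void)
{
	/* 20.50% of capacity maps to a clamp of 210 out of 1024. */
	printf("20.50%% -> %llu\n", (unsigned long long)percent_to_util(2050));
	/* 100.00% maps to the full capacity scale of 1024. */
	printf("100.00%% -> %llu\n", (unsigned long long)percent_to_util(10000));
	return 0;
}

So writing "20.50" to cpu.uclamp.min requests a minimum utilization clamp of roughly 20% of the 1024 capacity scale.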