author	Rafael J. Wysocki <rafael.j.wysocki@intel.com>	2016-04-01 19:09:12 -0400
committer	Rafael J. Wysocki <rafael.j.wysocki@intel.com>	2016-04-01 19:09:12 -0400
commit	9bdcb44e391da5c41b98573bf0305a0e0b1c9569
tree	d9785da0dfc47ca196fd8401e072a07623827793	/kernel/sched
parent	b7898fda5bc7e786e76ce24fbd2ec993b08ec518
cpufreq: schedutil: New governor based on scheduler utilization data
Add a new cpufreq scaling governor, called "schedutil", that uses
scheduler-provided CPU utilization information as input for making
its decisions. Doing that is possible after commit 34e2c555f3e1
(cpufreq: Add mechanism for registering utilization update callbacks)
that introduced cpufreq_update_util() called by the scheduler on
utilization changes (from CFS) and RT/DL task status updates. In
particular, CPU frequency scaling decisions may be based on the
utilization data passed to cpufreq_update_util() by CFS.

The new governor is relatively simple.

The frequency selection formula used by it depends on whether or not
the utilization is frequency-invariant. In the frequency-invariant
case the new CPU frequency is given by

	next_freq = 1.25 * max_freq * util / max

where util and max are the last two arguments of cpufreq_update_util().
In turn, if util is not frequency-invariant, the maximum frequency in
the above formula is replaced with the current frequency of the CPU:

	next_freq = 1.25 * curr_freq * util / max

The coefficient 1.25 corresponds to the frequency tipping point at
(util / max) = 0.8.

All of the computations are carried out in the utilization update
handlers provided by the new governor. One of those handlers is used
for cpufreq policies shared between multiple CPUs and the other one is
for policies with one CPU only (and therefore it doesn't need to use
any extra synchronization means).

The governor supports fast frequency switching if that is supported by
the cpufreq driver in use and possible for the given policy. In the
fast switching case, all operations of the governor take place in its
utilization update handlers. If fast switching cannot be used, the
frequency switch operations are carried out with the help of a work
item which only calls __cpufreq_driver_target() (under a mutex) to
trigger a frequency update (to a value already computed beforehand in
one of the utilization update handlers).

Currently, the governor treats all of the RT and DL tasks as "unknown
utilization" and sets the frequency to the allowed maximum when updated
from the RT or DL sched classes. That heavy-handed approach should be
replaced with something more subtle and specifically targeted at RT and
DL tasks.

The governor shares some tunables management code with the "ondemand"
and "conservative" governors and uses some common definitions from
cpufreq_governor.h, but apart from that it is stand-alone.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
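[Editor's note] For illustration, a minimal stand-alone sketch of the selection
formula above; the helper name and the sample numbers are hypothetical, and,
like the kernel code in this patch, it implements the coefficient 1.25 as
freq + (freq >> 2) in integer arithmetic:

	#include <stdio.h>

	/* Hypothetical user-space rendering of the schedutil formula
	 * next_freq = 1.25 * freq * util / max, where freq stands for
	 * max_freq in the frequency-invariant case and for curr_freq
	 * otherwise. */
	static unsigned long next_freq(unsigned long freq, unsigned long util,
				       unsigned long max)
	{
		return (freq + (freq >> 2)) * util / max;
	}

	int main(void)
	{
		/* At the tipping point util / max = 0.8, the frequency is kept. */
		printf("%lu\n", next_freq(2000000, 80, 100));	/* 2000000 */
		printf("%lu\n", next_freq(2000000, 50, 100));	/* 1250000 */
		return 0;
	}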
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/Makefile	1
-rw-r--r--	kernel/sched/cpufreq_schedutil.c	528
-rw-r--r--	kernel/sched/sched.h	8
3 files changed, 537 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 414d9c16da42..5e59b832ae2b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..d27ae064b476
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,528 @@
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/cpufreq.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/power.h>

#include "sched.h"

struct sugov_tunables {
	struct gov_attr_set attr_set;
	unsigned int rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy *policy;

	struct sugov_tunables *tunables;
	struct list_head tunables_hook;

	raw_spinlock_t update_lock;  /* For shared policies */
	u64 last_freq_update_time;
	s64 freq_update_delay_ns;
	unsigned int next_freq;

	/* The next fields are only needed if fast switch cannot be used. */
	struct irq_work irq_work;
	struct work_struct work;
	struct mutex work_lock;
	bool work_in_progress;

	bool need_freq_update;
};

struct sugov_cpu {
	struct update_util_data update_util;
	struct sugov_policy *sg_policy;

	/* The fields below are only needed when sharing a policy. */
	unsigned long util;
	unsigned long max;
	u64 last_update;
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

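/*
 * sugov_should_update_freq - Rate-limit frequency re-evaluation.
 *
 * Skip the update if a slow-path frequency switch is still in flight,
 * force one if the policy limits have changed, and otherwise allow it
 * only when at least freq_update_delay_ns has passed since the last
 * frequency update.
 */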
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	if (sg_policy->work_in_progress)
		return false;

	if (unlikely(sg_policy->need_freq_update)) {
		sg_policy->need_freq_update = false;
		/*
		 * This happens when limits change, so forget the previous
		 * next_freq value and force an update.
		 */
		sg_policy->next_freq = UINT_MAX;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}

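/*
 * sugov_update_commit - Apply the new frequency.
 *
 * With fast switching, the driver is invoked directly in scheduler
 * context; otherwise an irq_work is queued so that sugov_work() can
 * perform the switch in process context.
 */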
static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
				unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;

	sg_policy->last_freq_update_time = time;

	if (policy->fast_switch_enabled) {
		if (sg_policy->next_freq == next_freq) {
			trace_cpu_frequency(policy->cur, smp_processor_id());
			return;
		}
		sg_policy->next_freq = next_freq;
		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
		if (next_freq == CPUFREQ_ENTRY_INVALID)
			return;

		policy->cur = next_freq;
		trace_cpu_frequency(next_freq, smp_processor_id());
	} else if (sg_policy->next_freq != next_freq) {
		sg_policy->next_freq = next_freq;
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @policy: cpufreq policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 */
static unsigned int get_next_freq(struct cpufreq_policy *policy,
				  unsigned long util, unsigned long max)
{
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

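	/* freq + (freq >> 2) is 1.25 * freq in integer arithmetic. */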
	return (freq + (freq >> 2)) * util / max;
}

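/*
 * util == ULONG_MAX means "unknown utilization" (an update from the RT
 * or DL sched classes), in which case the frequency is set straight to
 * the allowed maximum.
 */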
static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned long util, unsigned long max)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int next_f;

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
			get_next_freq(policy, util, max);
	sugov_update_commit(sg_policy, time, next_f);
}

static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
					   unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int max_f = policy->cpuinfo.max_freq;
	u64 last_freq_update_time = sg_policy->last_freq_update_time;
	unsigned int j;

	if (util == ULONG_MAX)
		return max_f;

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu;
		unsigned long j_util, j_max;
		s64 delta_ns;

		if (j == smp_processor_id())
			continue;

		j_sg_cpu = &per_cpu(sugov_cpu, j);
		/*
		 * If the CPU utilization was last updated before the previous
		 * frequency update and the time elapsed between the last update
		 * of the CPU utilization and the last frequency update is long
		 * enough, don't take the CPU into account as it probably is
		 * idle now.
		 */
		delta_ns = last_freq_update_time - j_sg_cpu->last_update;
		if (delta_ns > TICK_NSEC)
			continue;

		j_util = j_sg_cpu->util;
		if (j_util == ULONG_MAX)
			return max_f;

		j_max = j_sg_cpu->max;
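		/*
		 * Cross-multiplying compares j_util / j_max with util / max
		 * without integer division: the CPU with the highest
		 * utilization-to-capacity ratio determines the frequency.
		 */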
		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}
	}

	return get_next_freq(policy, util, max);
}

static void sugov_update_shared(struct update_util_data *hook, u64 time,
				unsigned long util, unsigned long max)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sg_cpu->util = util;
	sg_cpu->max = max;
	sg_cpu->last_update = time;

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_policy, util, max);
		sugov_update_commit(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}

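/*
 * Slow path: sugov_irq_work() runs in interrupt context and only queues
 * sugov_work(), which then performs the actual __cpufreq_driver_target()
 * call in process context under work_lock.
 */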
static void sugov_work(struct work_struct *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
				CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);

	sg_policy->work_in_progress = false;
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
	schedule_work_on(smp_processor_id(), &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
				   size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

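/*
 * rate_limit_us appears in the governor's sysfs directory; the exact
 * location depends on have_governor_per_policy(), e.g. (path shown for
 * illustration only):
 *   /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 */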
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attributes[] = {
	&rate_limit_us.attr,
	NULL
};

static struct kobj_type sugov_tunables_ktype = {
	.default_attrs = sugov_attributes,
	.sysfs_ops = &governor_sysfs_ops,
};

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	INIT_WORK(&sg_policy->work, sugov_work);
	mutex_init(&sg_policy->work_lock);
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	mutex_destroy(&sg_policy->work_lock);
	kfree(sg_policy);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	unsigned int lat;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy)
		return -ENOMEM;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto free_sg_policy;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto free_sg_policy;
	}

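	/*
	 * Default rate limit: LATENCY_MULTIPLIER scaled by the driver's
	 * transition latency converted to microseconds, when that latency
	 * is known.
	 */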
	tunables->rate_limit_us = LATENCY_MULTIPLIER;
	lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
	if (lat)
		tunables->rate_limit_us *= lat;

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

 out:
	mutex_unlock(&global_tunables_lock);

	cpufreq_enable_fast_switch(policy);
	return 0;

 fail:
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

 free_sg_policy:
	mutex_unlock(&global_tunables_lock);

	sugov_policy_free(sg_policy);
	pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
	return ret;
}

static int sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_policy_free(sg_policy);
	return 0;
}

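/*
 * sugov_start - Reset per-policy and per-CPU state and register the
 * utilization update callbacks: sugov_update_shared() for policies
 * spanning multiple CPUs (which need update_lock), sugov_update_single()
 * for single-CPU policies.
 */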
static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = UINT_MAX;
	sg_policy->work_in_progress = false;
	sg_policy->need_freq_update = false;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		sg_cpu->sg_policy = sg_policy;
		if (policy_is_shared(policy)) {
			sg_cpu->util = ULONG_MAX;
			sg_cpu->max = 0;
			sg_cpu->last_update = 0;
			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
						     sugov_update_shared);
		} else {
			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
						     sugov_update_single);
		}
	}
	return 0;
}

static int sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_sched();

	irq_work_sync(&sg_policy->irq_work);
	cancel_work_sync(&sg_policy->work);
	return 0;
}

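/*
 * sugov_limits - React to changes of policy->min and policy->max: on the
 * slow path, clamp the current frequency into the new bounds right away;
 * in all cases set need_freq_update so that the next utilization update
 * recomputes the frequency from scratch.
 */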
static int sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);

		if (policy->max < policy->cur)
			__cpufreq_driver_target(policy, policy->max,
						CPUFREQ_RELATION_H);
		else if (policy->min > policy->cur)
			__cpufreq_driver_target(policy, policy->min,
						CPUFREQ_RELATION_L);

		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->need_freq_update = true;
	return 0;
}

int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
{
	if (event == CPUFREQ_GOV_POLICY_INIT) {
		return sugov_init(policy);
	} else if (policy->governor_data) {
		switch (event) {
		case CPUFREQ_GOV_POLICY_EXIT:
			return sugov_exit(policy);
		case CPUFREQ_GOV_START:
			return sugov_start(policy);
		case CPUFREQ_GOV_STOP:
			return sugov_stop(policy);
		case CPUFREQ_GOV_LIMITS:
			return sugov_limits(policy);
		}
	}
	return -EINVAL;
}

static struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.governor = sugov_governor,
	.owner = THIS_MODULE,
};

static int __init sugov_module_init(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}

static void __exit sugov_module_exit(void)
{
	cpufreq_unregister_governor(&schedutil_gov);
}

MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}

fs_initcall(sugov_module_init);
#else
module_init(sugov_module_init);
#endif
module_exit(sugov_module_exit);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec2e8d23527e..921d6e5d33b7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1842,6 +1842,14 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
 static inline void cpufreq_trigger_update(u64 time) {}
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant()	(true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant()	(false)
+#endif
+
 static inline void account_reset_rq(struct rq *rq)
 {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING