summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/energy_model.h187
-rw-r--r--kernel/power/Kconfig15
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/energy_model.c201
4 files changed, 405 insertions, 0 deletions
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
new file mode 100644
index 000000000000..aa027f7bcb3e
--- /dev/null
+++ b/include/linux/energy_model.h
@@ -0,0 +1,187 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_ENERGY_MODEL_H
3#define _LINUX_ENERGY_MODEL_H
4#include <linux/cpumask.h>
5#include <linux/jump_label.h>
6#include <linux/kobject.h>
7#include <linux/rcupdate.h>
8#include <linux/sched/cpufreq.h>
9#include <linux/sched/topology.h>
10#include <linux/types.h>
11
12#ifdef CONFIG_ENERGY_MODEL
/**
 * struct em_cap_state - Capacity state of a performance domain
 *
 * One entry of a performance domain's table: a frequency, the power a
 * single CPU draws at that frequency, and a derived cost coefficient.
 */
struct em_cap_state {
	/** @frequency: The CPU frequency in KHz, for consistency with CPUFreq */
	unsigned long frequency;
	/** @power: The power consumed by 1 CPU at this level, in milli-watts */
	unsigned long power;
	/**
	 * @cost: The cost coefficient associated with this level, used
	 * during energy calculation. Equal to:
	 * power * max_frequency / frequency
	 */
	unsigned long cost;
};
25
/**
 * struct em_perf_domain - Performance domain
 * @table:		List of capacity states, in ascending order
 * @nr_cap_states:	Number of capacity states
 * @cpus:		Cpumask covering the CPUs of the domain
 *
 * A "performance domain" represents a group of CPUs whose performance is
 * scaled together. All CPUs of a performance domain must have the same
 * micro-architecture. Performance domains often have a 1-to-1 mapping with
 * CPUFreq policies.
 *
 * @cpus is a flexible array member: the storage for the cpumask is allocated
 * together with the structure, right behind it, so it has to stay the last
 * member. Access it through to_cpumask(pd->cpus).
 */
struct em_perf_domain {
	struct em_cap_state *table;
	int nr_cap_states;
	unsigned long cpus[];
};
42
/* Upper bound, in milli-watts, of the power a driver may report for one CPU */
#define EM_CPU_MAX_POWER 0xFFFF

/**
 * struct em_data_callback - Driver-provided source of Energy Model data
 * @active_power: see the description on the member itself
 */
struct em_data_callback {
	/**
	 * active_power() - Provide power at the next capacity state of a CPU
	 * @power	: Active power at the capacity state in mW (modified)
	 * @freq	: Frequency at the capacity state in kHz (modified)
	 * @cpu		: CPU for which we do this operation
	 *
	 * The callback looks up the lowest capacity state of 'cpu' that sits
	 * above the incoming 'freq', then overwrites 'freq' with the frequency
	 * of that state and 'power' with its active power.
	 *
	 * The reported power is the one of a single CPU of the domain, in
	 * milli-watts, and is expected to lie within [0, EM_CPU_MAX_POWER].
	 *
	 * Return 0 on success.
	 */
	int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
};

/* Static initializer for a struct em_data_callback wrapping one function */
#define EM_DATA_CB(_active_power_cb) \
	{ .active_power = &_active_power_cb }
65
66struct em_perf_domain *em_cpu_get(int cpu);
67int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
68 struct em_data_callback *cb);
69
70/**
71 * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
72 * @pd : performance domain for which energy has to be estimated
73 * @max_util : highest utilization among CPUs of the domain
74 * @sum_util : sum of the utilization of all CPUs in the domain
75 *
76 * Return: the sum of the energy consumed by the CPUs of the domain assuming
77 * a capacity state satisfying the max utilization of the domain.
78 */
79static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
80 unsigned long max_util, unsigned long sum_util)
81{
82 unsigned long freq, scale_cpu;
83 struct em_cap_state *cs;
84 int i, cpu;
85
86 /*
87 * In order to predict the capacity state, map the utilization of the
88 * most utilized CPU of the performance domain to a requested frequency,
89 * like schedutil.
90 */
91 cpu = cpumask_first(to_cpumask(pd->cpus));
92 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
93 cs = &pd->table[pd->nr_cap_states - 1];
94 freq = map_util_freq(max_util, cs->frequency, scale_cpu);
95
96 /*
97 * Find the lowest capacity state of the Energy Model above the
98 * requested frequency.
99 */
100 for (i = 0; i < pd->nr_cap_states; i++) {
101 cs = &pd->table[i];
102 if (cs->frequency >= freq)
103 break;
104 }
105
106 /*
107 * The capacity of a CPU in the domain at that capacity state (cs)
108 * can be computed as:
109 *
110 * cs->freq * scale_cpu
111 * cs->cap = -------------------- (1)
112 * cpu_max_freq
113 *
114 * So, ignoring the costs of idle states (which are not available in
115 * the EM), the energy consumed by this CPU at that capacity state is
116 * estimated as:
117 *
118 * cs->power * cpu_util
119 * cpu_nrg = -------------------- (2)
120 * cs->cap
121 *
122 * since 'cpu_util / cs->cap' represents its percentage of busy time.
123 *
124 * NOTE: Although the result of this computation actually is in
125 * units of power, it can be manipulated as an energy value
126 * over a scheduling period, since it is assumed to be
127 * constant during that interval.
128 *
129 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
130 * of two terms:
131 *
132 * cs->power * cpu_max_freq cpu_util
133 * cpu_nrg = ------------------------ * --------- (3)
134 * cs->freq scale_cpu
135 *
136 * The first term is static, and is stored in the em_cap_state struct
137 * as 'cs->cost'.
138 *
139 * Since all CPUs of the domain have the same micro-architecture, they
140 * share the same 'cs->cost', and the same CPU capacity. Hence, the
141 * total energy of the domain (which is the simple sum of the energy of
142 * all of its CPUs) can be factorized as:
143 *
144 * cs->cost * \Sum cpu_util
145 * pd_nrg = ------------------------ (4)
146 * scale_cpu
147 */
148 return cs->cost * sum_util / scale_cpu;
149}
150
151/**
152 * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain
153 * @pd : performance domain for which this must be done
154 *
155 * Return: the number of capacity states in the performance domain table
156 */
157static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
158{
159 return pd->nr_cap_states;
160}
161
162#else
163struct em_perf_domain {};
164struct em_data_callback {};
165#define EM_DATA_CB(_active_power_cb) { }
166
167static inline int em_register_perf_domain(cpumask_t *span,
168 unsigned int nr_states, struct em_data_callback *cb)
169{
170 return -EINVAL;
171}
172static inline struct em_perf_domain *em_cpu_get(int cpu)
173{
174 return NULL;
175}
176static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
177 unsigned long max_util, unsigned long sum_util)
178{
179 return 0;
180}
181static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
182{
183 return 0;
184}
185#endif
186
187#endif
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3a6c2f87699e..f8fe57d1022e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF
298 298
299config CPU_PM 299config CPU_PM
300 bool 300 bool
301
302config ENERGY_MODEL
303 bool "Energy Model for CPUs"
304 depends on SMP
305 depends on CPU_FREQ
306 default n
307 help
308 Several subsystems (thermal and/or the task scheduler for example)
309 can leverage information about the energy consumed by CPUs to make
310 smarter decisions. This config option enables the framework from
311 which subsystems can access the energy models.
312
313 The exact usage of the energy model is subsystem-dependent.
314
315 If in doubt, say N.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index a3f79f0eef36..e7e47d9be1e5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
15obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o 15obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
16 16
17obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 17obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
18
19obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
new file mode 100644
index 000000000000..d9dc2c38764a
--- /dev/null
+++ b/kernel/power/energy_model.c
@@ -0,0 +1,201 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Energy Model of CPUs
4 *
5 * Copyright (c) 2018, Arm ltd.
6 * Written by: Quentin Perret, Arm ltd.
7 */
8
9#define pr_fmt(fmt) "energy_model: " fmt
10
11#include <linux/cpu.h>
12#include <linux/cpumask.h>
13#include <linux/energy_model.h>
14#include <linux/sched/topology.h>
15#include <linux/slab.h>
16
17/* Mapping of each CPU to the performance domain to which it belongs. */
18static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
19
20/*
21 * Mutex serializing the registrations of performance domains and letting
22 * callbacks defined by drivers sleep.
23 */
24static DEFINE_MUTEX(em_pd_mutex);
25
26static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
27 struct em_data_callback *cb)
28{
29 unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
30 unsigned long power, freq, prev_freq = 0;
31 int i, ret, cpu = cpumask_first(span);
32 struct em_cap_state *table;
33 struct em_perf_domain *pd;
34 u64 fmax;
35
36 if (!cb->active_power)
37 return NULL;
38
39 pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
40 if (!pd)
41 return NULL;
42
43 table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
44 if (!table)
45 goto free_pd;
46
47 /* Build the list of capacity states for this performance domain */
48 for (i = 0, freq = 0; i < nr_states; i++, freq++) {
49 /*
50 * active_power() is a driver callback which ceils 'freq' to
51 * lowest capacity state of 'cpu' above 'freq' and updates
52 * 'power' and 'freq' accordingly.
53 */
54 ret = cb->active_power(&power, &freq, cpu);
55 if (ret) {
56 pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
57 goto free_cs_table;
58 }
59
60 /*
61 * We expect the driver callback to increase the frequency for
62 * higher capacity states.
63 */
64 if (freq <= prev_freq) {
65 pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
66 goto free_cs_table;
67 }
68
69 /*
70 * The power returned by active_state() is expected to be
71 * positive, in milli-watts and to fit into 16 bits.
72 */
73 if (!power || power > EM_CPU_MAX_POWER) {
74 pr_err("pd%d: invalid power: %lu\n", cpu, power);
75 goto free_cs_table;
76 }
77
78 table[i].power = power;
79 table[i].frequency = prev_freq = freq;
80
81 /*
82 * The hertz/watts efficiency ratio should decrease as the
83 * frequency grows on sane platforms. But this isn't always
84 * true in practice so warn the user if a higher OPP is more
85 * power efficient than a lower one.
86 */
87 opp_eff = freq / power;
88 if (opp_eff >= prev_opp_eff)
89 pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
90 cpu, i, i - 1);
91 prev_opp_eff = opp_eff;
92 }
93
94 /* Compute the cost of each capacity_state. */
95 fmax = (u64) table[nr_states - 1].frequency;
96 for (i = 0; i < nr_states; i++) {
97 table[i].cost = div64_u64(fmax * table[i].power,
98 table[i].frequency);
99 }
100
101 pd->table = table;
102 pd->nr_cap_states = nr_states;
103 cpumask_copy(to_cpumask(pd->cpus), span);
104
105 return pd;
106
107free_cs_table:
108 kfree(table);
109free_pd:
110 kfree(pd);
111
112 return NULL;
113}
114
115/**
116 * em_cpu_get() - Return the performance domain for a CPU
117 * @cpu : CPU to find the performance domain for
118 *
119 * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
120 * exist.
121 */
122struct em_perf_domain *em_cpu_get(int cpu)
123{
124 return READ_ONCE(per_cpu(em_data, cpu));
125}
126EXPORT_SYMBOL_GPL(em_cpu_get);
127
128/**
129 * em_register_perf_domain() - Register the Energy Model of a performance domain
130 * @span : Mask of CPUs in the performance domain
131 * @nr_states : Number of capacity states to register
132 * @cb : Callback functions providing the data of the Energy Model
133 *
134 * Create Energy Model tables for a performance domain using the callbacks
135 * defined in cb.
136 *
137 * If multiple clients register the same performance domain, all but the first
138 * registration will be ignored.
139 *
140 * Return 0 on success
141 */
142int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
143 struct em_data_callback *cb)
144{
145 unsigned long cap, prev_cap = 0;
146 struct em_perf_domain *pd;
147 int cpu, ret = 0;
148
149 if (!span || !nr_states || !cb)
150 return -EINVAL;
151
152 /*
153 * Use a mutex to serialize the registration of performance domains and
154 * let the driver-defined callback functions sleep.
155 */
156 mutex_lock(&em_pd_mutex);
157
158 for_each_cpu(cpu, span) {
159 /* Make sure we don't register again an existing domain. */
160 if (READ_ONCE(per_cpu(em_data, cpu))) {
161 ret = -EEXIST;
162 goto unlock;
163 }
164
165 /*
166 * All CPUs of a domain must have the same micro-architecture
167 * since they all share the same table.
168 */
169 cap = arch_scale_cpu_capacity(NULL, cpu);
170 if (prev_cap && prev_cap != cap) {
171 pr_err("CPUs of %*pbl must have the same capacity\n",
172 cpumask_pr_args(span));
173 ret = -EINVAL;
174 goto unlock;
175 }
176 prev_cap = cap;
177 }
178
179 /* Create the performance domain and add it to the Energy Model. */
180 pd = em_create_pd(span, nr_states, cb);
181 if (!pd) {
182 ret = -EINVAL;
183 goto unlock;
184 }
185
186 for_each_cpu(cpu, span) {
187 /*
188 * The per-cpu array can be read concurrently from em_cpu_get().
189 * The barrier enforces the ordering needed to make sure readers
190 * can only access well formed em_perf_domain structs.
191 */
192 smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
193 }
194
195 pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
196unlock:
197 mutex_unlock(&em_pd_mutex);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(em_register_perf_domain);