diff options
-rw-r--r-- | include/linux/energy_model.h | 187 | ||||
-rw-r--r-- | kernel/power/Kconfig | 15 | ||||
-rw-r--r-- | kernel/power/Makefile | 2 | ||||
-rw-r--r-- | kernel/power/energy_model.c | 201 |
4 files changed, 405 insertions, 0 deletions
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h new file mode 100644 index 000000000000..aa027f7bcb3e --- /dev/null +++ b/include/linux/energy_model.h | |||
@@ -0,0 +1,187 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef _LINUX_ENERGY_MODEL_H | ||
3 | #define _LINUX_ENERGY_MODEL_H | ||
4 | #include <linux/cpumask.h> | ||
5 | #include <linux/jump_label.h> | ||
6 | #include <linux/kobject.h> | ||
7 | #include <linux/rcupdate.h> | ||
8 | #include <linux/sched/cpufreq.h> | ||
9 | #include <linux/sched/topology.h> | ||
10 | #include <linux/types.h> | ||
11 | |||
12 | #ifdef CONFIG_ENERGY_MODEL | ||
/**
 * struct em_cap_state - Capacity state of a performance domain
 * @frequency:	The CPU frequency in kHz, for consistency with CPUFreq
 * @power:	The power consumed by 1 CPU at this level, in milli-watts
 * @cost:	The cost coefficient associated with this level, used during
 *		energy calculation. Equal to: power * max_frequency / frequency
 */
struct em_cap_state {
	unsigned long frequency;
	unsigned long power;
	unsigned long cost;
};
25 | |||
/**
 * struct em_perf_domain - Performance domain
 * @table:		List of capacity states, in ascending order
 * @nr_cap_states:	Number of capacity states
 * @cpus:		Cpumask covering the CPUs of the domain
 *
 * A "performance domain" represents a group of CPUs whose performance is
 * scaled together. All CPUs of a performance domain must have the same
 * micro-architecture. Performance domains often have a 1-to-1 mapping with
 * CPUFreq policies.
 *
 * @cpus is a trailing flexible array: the structure must be allocated with
 * enough extra room after it to hold the cpumask bits.
 */
struct em_perf_domain {
	struct em_cap_state *table;
	int nr_cap_states;
	unsigned long cpus[];
};
42 | |||
43 | #define EM_CPU_MAX_POWER 0xFFFF | ||
44 | |||
/*
 * Driver-provided callbacks used to populate the Energy Model of a
 * performance domain.
 */
struct em_data_callback {
	/**
	 * active_power() - Report the power of the next capacity state of a CPU
	 * @power	: Active power at the capacity state, in mW (output)
	 * @freq	: Frequency at the capacity state, in kHz (input/output)
	 * @cpu		: CPU for which the query is made
	 *
	 * The callback looks up the lowest capacity state of 'cpu' whose
	 * frequency is above the incoming 'freq', then writes the frequency
	 * and active power of that state back through 'freq' and 'power'.
	 *
	 * The reported power is that of a single CPU of the domain, in
	 * milli-watts, and is expected to lie within [0, EM_CPU_MAX_POWER].
	 *
	 * Return 0 on success.
	 */
	int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
};
/* Static initializer for a struct em_data_callback. */
#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
65 | |||
66 | struct em_perf_domain *em_cpu_get(int cpu); | ||
67 | int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, | ||
68 | struct em_data_callback *cb); | ||
69 | |||
70 | /** | ||
71 | * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain | ||
72 | * @pd : performance domain for which energy has to be estimated | ||
73 | * @max_util : highest utilization among CPUs of the domain | ||
74 | * @sum_util : sum of the utilization of all CPUs in the domain | ||
75 | * | ||
76 | * Return: the sum of the energy consumed by the CPUs of the domain assuming | ||
77 | * a capacity state satisfying the max utilization of the domain. | ||
78 | */ | ||
79 | static inline unsigned long em_pd_energy(struct em_perf_domain *pd, | ||
80 | unsigned long max_util, unsigned long sum_util) | ||
81 | { | ||
82 | unsigned long freq, scale_cpu; | ||
83 | struct em_cap_state *cs; | ||
84 | int i, cpu; | ||
85 | |||
86 | /* | ||
87 | * In order to predict the capacity state, map the utilization of the | ||
88 | * most utilized CPU of the performance domain to a requested frequency, | ||
89 | * like schedutil. | ||
90 | */ | ||
91 | cpu = cpumask_first(to_cpumask(pd->cpus)); | ||
92 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
93 | cs = &pd->table[pd->nr_cap_states - 1]; | ||
94 | freq = map_util_freq(max_util, cs->frequency, scale_cpu); | ||
95 | |||
96 | /* | ||
97 | * Find the lowest capacity state of the Energy Model above the | ||
98 | * requested frequency. | ||
99 | */ | ||
100 | for (i = 0; i < pd->nr_cap_states; i++) { | ||
101 | cs = &pd->table[i]; | ||
102 | if (cs->frequency >= freq) | ||
103 | break; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * The capacity of a CPU in the domain at that capacity state (cs) | ||
108 | * can be computed as: | ||
109 | * | ||
110 | * cs->freq * scale_cpu | ||
111 | * cs->cap = -------------------- (1) | ||
112 | * cpu_max_freq | ||
113 | * | ||
114 | * So, ignoring the costs of idle states (which are not available in | ||
115 | * the EM), the energy consumed by this CPU at that capacity state is | ||
116 | * estimated as: | ||
117 | * | ||
118 | * cs->power * cpu_util | ||
119 | * cpu_nrg = -------------------- (2) | ||
120 | * cs->cap | ||
121 | * | ||
122 | * since 'cpu_util / cs->cap' represents its percentage of busy time. | ||
123 | * | ||
124 | * NOTE: Although the result of this computation actually is in | ||
125 | * units of power, it can be manipulated as an energy value | ||
126 | * over a scheduling period, since it is assumed to be | ||
127 | * constant during that interval. | ||
128 | * | ||
129 | * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product | ||
130 | * of two terms: | ||
131 | * | ||
132 | * cs->power * cpu_max_freq cpu_util | ||
133 | * cpu_nrg = ------------------------ * --------- (3) | ||
134 | * cs->freq scale_cpu | ||
135 | * | ||
136 | * The first term is static, and is stored in the em_cap_state struct | ||
137 | * as 'cs->cost'. | ||
138 | * | ||
139 | * Since all CPUs of the domain have the same micro-architecture, they | ||
140 | * share the same 'cs->cost', and the same CPU capacity. Hence, the | ||
141 | * total energy of the domain (which is the simple sum of the energy of | ||
142 | * all of its CPUs) can be factorized as: | ||
143 | * | ||
144 | * cs->cost * \Sum cpu_util | ||
145 | * pd_nrg = ------------------------ (4) | ||
146 | * scale_cpu | ||
147 | */ | ||
148 | return cs->cost * sum_util / scale_cpu; | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain | ||
153 | * @pd : performance domain for which this must be done | ||
154 | * | ||
155 | * Return: the number of capacity states in the performance domain table | ||
156 | */ | ||
157 | static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) | ||
158 | { | ||
159 | return pd->nr_cap_states; | ||
160 | } | ||
161 | |||
162 | #else | ||
163 | struct em_perf_domain {}; | ||
164 | struct em_data_callback {}; | ||
165 | #define EM_DATA_CB(_active_power_cb) { } | ||
166 | |||
167 | static inline int em_register_perf_domain(cpumask_t *span, | ||
168 | unsigned int nr_states, struct em_data_callback *cb) | ||
169 | { | ||
170 | return -EINVAL; | ||
171 | } | ||
172 | static inline struct em_perf_domain *em_cpu_get(int cpu) | ||
173 | { | ||
174 | return NULL; | ||
175 | } | ||
176 | static inline unsigned long em_pd_energy(struct em_perf_domain *pd, | ||
177 | unsigned long max_util, unsigned long sum_util) | ||
178 | { | ||
179 | return 0; | ||
180 | } | ||
181 | static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) | ||
182 | { | ||
183 | return 0; | ||
184 | } | ||
185 | #endif | ||
186 | |||
187 | #endif | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3a6c2f87699e..f8fe57d1022e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF | |||
298 | 298 | ||
299 | config CPU_PM | 299 | config CPU_PM |
300 | bool | 300 | bool |
301 | |||
302 | config ENERGY_MODEL | ||
303 | bool "Energy Model for CPUs" | ||
304 | depends on SMP | ||
305 | depends on CPU_FREQ | ||
306 | default n | ||
307 | help | ||
308 | Several subsystems (thermal and/or the task scheduler for example) | ||
309 | can leverage information about the energy consumed by CPUs to make | ||
310 | smarter decisions. This config option enables the framework from | ||
311 | which subsystems can access the energy models. | ||
312 | |||
313 | The exact usage of the energy model is subsystem-dependent. | ||
314 | |||
315 | If in doubt, say N. | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index a3f79f0eef36..e7e47d9be1e5 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o | |||
15 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o | 15 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o |
16 | 16 | ||
17 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 17 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
18 | |||
19 | obj-$(CONFIG_ENERGY_MODEL) += energy_model.o | ||
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c new file mode 100644 index 000000000000..d9dc2c38764a --- /dev/null +++ b/kernel/power/energy_model.c | |||
@@ -0,0 +1,201 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Energy Model of CPUs | ||
4 | * | ||
5 | * Copyright (c) 2018, Arm ltd. | ||
6 | * Written by: Quentin Perret, Arm ltd. | ||
7 | */ | ||
8 | |||
9 | #define pr_fmt(fmt) "energy_model: " fmt | ||
10 | |||
11 | #include <linux/cpu.h> | ||
12 | #include <linux/cpumask.h> | ||
13 | #include <linux/energy_model.h> | ||
14 | #include <linux/sched/topology.h> | ||
15 | #include <linux/slab.h> | ||
16 | |||
17 | /* Mapping of each CPU to the performance domain to which it belongs. */ | ||
18 | static DEFINE_PER_CPU(struct em_perf_domain *, em_data); | ||
19 | |||
20 | /* | ||
21 | * Mutex serializing the registrations of performance domains and letting | ||
22 | * callbacks defined by drivers sleep. | ||
23 | */ | ||
24 | static DEFINE_MUTEX(em_pd_mutex); | ||
25 | |||
26 | static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, | ||
27 | struct em_data_callback *cb) | ||
28 | { | ||
29 | unsigned long opp_eff, prev_opp_eff = ULONG_MAX; | ||
30 | unsigned long power, freq, prev_freq = 0; | ||
31 | int i, ret, cpu = cpumask_first(span); | ||
32 | struct em_cap_state *table; | ||
33 | struct em_perf_domain *pd; | ||
34 | u64 fmax; | ||
35 | |||
36 | if (!cb->active_power) | ||
37 | return NULL; | ||
38 | |||
39 | pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); | ||
40 | if (!pd) | ||
41 | return NULL; | ||
42 | |||
43 | table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); | ||
44 | if (!table) | ||
45 | goto free_pd; | ||
46 | |||
47 | /* Build the list of capacity states for this performance domain */ | ||
48 | for (i = 0, freq = 0; i < nr_states; i++, freq++) { | ||
49 | /* | ||
50 | * active_power() is a driver callback which ceils 'freq' to | ||
51 | * lowest capacity state of 'cpu' above 'freq' and updates | ||
52 | * 'power' and 'freq' accordingly. | ||
53 | */ | ||
54 | ret = cb->active_power(&power, &freq, cpu); | ||
55 | if (ret) { | ||
56 | pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); | ||
57 | goto free_cs_table; | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * We expect the driver callback to increase the frequency for | ||
62 | * higher capacity states. | ||
63 | */ | ||
64 | if (freq <= prev_freq) { | ||
65 | pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); | ||
66 | goto free_cs_table; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * The power returned by active_state() is expected to be | ||
71 | * positive, in milli-watts and to fit into 16 bits. | ||
72 | */ | ||
73 | if (!power || power > EM_CPU_MAX_POWER) { | ||
74 | pr_err("pd%d: invalid power: %lu\n", cpu, power); | ||
75 | goto free_cs_table; | ||
76 | } | ||
77 | |||
78 | table[i].power = power; | ||
79 | table[i].frequency = prev_freq = freq; | ||
80 | |||
81 | /* | ||
82 | * The hertz/watts efficiency ratio should decrease as the | ||
83 | * frequency grows on sane platforms. But this isn't always | ||
84 | * true in practice so warn the user if a higher OPP is more | ||
85 | * power efficient than a lower one. | ||
86 | */ | ||
87 | opp_eff = freq / power; | ||
88 | if (opp_eff >= prev_opp_eff) | ||
89 | pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", | ||
90 | cpu, i, i - 1); | ||
91 | prev_opp_eff = opp_eff; | ||
92 | } | ||
93 | |||
94 | /* Compute the cost of each capacity_state. */ | ||
95 | fmax = (u64) table[nr_states - 1].frequency; | ||
96 | for (i = 0; i < nr_states; i++) { | ||
97 | table[i].cost = div64_u64(fmax * table[i].power, | ||
98 | table[i].frequency); | ||
99 | } | ||
100 | |||
101 | pd->table = table; | ||
102 | pd->nr_cap_states = nr_states; | ||
103 | cpumask_copy(to_cpumask(pd->cpus), span); | ||
104 | |||
105 | return pd; | ||
106 | |||
107 | free_cs_table: | ||
108 | kfree(table); | ||
109 | free_pd: | ||
110 | kfree(pd); | ||
111 | |||
112 | return NULL; | ||
113 | } | ||
114 | |||
115 | /** | ||
116 | * em_cpu_get() - Return the performance domain for a CPU | ||
117 | * @cpu : CPU to find the performance domain for | ||
118 | * | ||
119 | * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't | ||
120 | * exist. | ||
121 | */ | ||
122 | struct em_perf_domain *em_cpu_get(int cpu) | ||
123 | { | ||
124 | return READ_ONCE(per_cpu(em_data, cpu)); | ||
125 | } | ||
126 | EXPORT_SYMBOL_GPL(em_cpu_get); | ||
127 | |||
128 | /** | ||
129 | * em_register_perf_domain() - Register the Energy Model of a performance domain | ||
130 | * @span : Mask of CPUs in the performance domain | ||
131 | * @nr_states : Number of capacity states to register | ||
132 | * @cb : Callback functions providing the data of the Energy Model | ||
133 | * | ||
134 | * Create Energy Model tables for a performance domain using the callbacks | ||
135 | * defined in cb. | ||
136 | * | ||
137 | * If multiple clients register the same performance domain, all but the first | ||
138 | * registration will be ignored. | ||
139 | * | ||
140 | * Return 0 on success | ||
141 | */ | ||
142 | int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, | ||
143 | struct em_data_callback *cb) | ||
144 | { | ||
145 | unsigned long cap, prev_cap = 0; | ||
146 | struct em_perf_domain *pd; | ||
147 | int cpu, ret = 0; | ||
148 | |||
149 | if (!span || !nr_states || !cb) | ||
150 | return -EINVAL; | ||
151 | |||
152 | /* | ||
153 | * Use a mutex to serialize the registration of performance domains and | ||
154 | * let the driver-defined callback functions sleep. | ||
155 | */ | ||
156 | mutex_lock(&em_pd_mutex); | ||
157 | |||
158 | for_each_cpu(cpu, span) { | ||
159 | /* Make sure we don't register again an existing domain. */ | ||
160 | if (READ_ONCE(per_cpu(em_data, cpu))) { | ||
161 | ret = -EEXIST; | ||
162 | goto unlock; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * All CPUs of a domain must have the same micro-architecture | ||
167 | * since they all share the same table. | ||
168 | */ | ||
169 | cap = arch_scale_cpu_capacity(NULL, cpu); | ||
170 | if (prev_cap && prev_cap != cap) { | ||
171 | pr_err("CPUs of %*pbl must have the same capacity\n", | ||
172 | cpumask_pr_args(span)); | ||
173 | ret = -EINVAL; | ||
174 | goto unlock; | ||
175 | } | ||
176 | prev_cap = cap; | ||
177 | } | ||
178 | |||
179 | /* Create the performance domain and add it to the Energy Model. */ | ||
180 | pd = em_create_pd(span, nr_states, cb); | ||
181 | if (!pd) { | ||
182 | ret = -EINVAL; | ||
183 | goto unlock; | ||
184 | } | ||
185 | |||
186 | for_each_cpu(cpu, span) { | ||
187 | /* | ||
188 | * The per-cpu array can be read concurrently from em_cpu_get(). | ||
189 | * The barrier enforces the ordering needed to make sure readers | ||
190 | * can only access well formed em_perf_domain structs. | ||
191 | */ | ||
192 | smp_store_release(per_cpu_ptr(&em_data, cpu), pd); | ||
193 | } | ||
194 | |||
195 | pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); | ||
196 | unlock: | ||
197 | mutex_unlock(&em_pd_mutex); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | EXPORT_SYMBOL_GPL(em_register_perf_domain); | ||