diff options
author | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2019-02-01 05:57:46 -0500 |
---|---|---|
committer | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2019-02-01 05:57:46 -0500 |
commit | 8a56bdeb09007e33107e6fdf72909e74c2fe1b95 (patch) | |
tree | fdb34498580dc72ac3479ed7e7f5e1aaaa4cfe4f | |
parent | 1617971c6616c87185cbc78fa1a86dfc70dd16b6 (diff) | |
parent | 44021606298870e4adc641ef3927e7bb47ca8236 (diff) |
Merge back earlier cpuidle material for v5.1.
-rw-r--r-- | Documentation/admin-guide/pm/cpuidle.rst | 104 | ||||
-rw-r--r-- | Documentation/cpuidle/driver.txt | 37 | ||||
-rw-r--r-- | Documentation/cpuidle/governor.txt | 28 | ||||
-rw-r--r-- | Documentation/driver-api/pm/cpuidle.rst | 282 | ||||
-rw-r--r-- | Documentation/driver-api/pm/index.rst | 7 | ||||
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | drivers/cpuidle/Kconfig | 11 | ||||
-rw-r--r-- | drivers/cpuidle/governors/Makefile | 1 | ||||
-rw-r--r-- | drivers/cpuidle/governors/teo.c | 444 | ||||
-rw-r--r-- | include/linux/cpuidle.h | 8 |
10 files changed, 841 insertions, 82 deletions
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 106379e2619f..9c58b35a81cb 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst | |||
@@ -155,14 +155,14 @@ governor uses that information depends on what algorithm is implemented by it | |||
155 | and that is the primary reason for having more than one governor in the | 155 | and that is the primary reason for having more than one governor in the |
156 | ``CPUIdle`` subsystem. | 156 | ``CPUIdle`` subsystem. |
157 | 157 | ||
158 | There are two ``CPUIdle`` governors available, ``menu`` and ``ladder``. Which | 158 | There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_ |
159 | of them is used depends on the configuration of the kernel and in particular on | 159 | and ``ladder``. Which of them is used by default depends on the configuration |
160 | whether or not the scheduler tick can be `stopped by the idle | 160 | of the kernel and in particular on whether or not the scheduler tick can be |
161 | loop <idle-cpus-and-tick_>`_. It is possible to change the governor at run time | 161 | `stopped by the idle loop <idle-cpus-and-tick_>`_. It is possible to change the |
162 | if the ``cpuidle_sysfs_switch`` command line parameter has been passed to the | 162 | governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has |
163 | kernel, but that is not safe in general, so it should not be done on production | 163 | been passed to the kernel, but that is not safe in general, so it should not be |
164 | systems (that may change in the future, though). The name of the ``CPUIdle`` | 164 | done on production systems (that may change in the future, though). The name of |
165 | governor currently used by the kernel can be read from the | 165 | the ``CPUIdle`` governor currently used by the kernel can be read from the |
166 | :file:`current_governor_ro` (or :file:`current_governor` if | 166 | :file:`current_governor_ro` (or :file:`current_governor` if |
167 | ``cpuidle_sysfs_switch`` is present in the kernel command line) file under | 167 | ``cpuidle_sysfs_switch`` is present in the kernel command line) file under |
168 | :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``. | 168 | :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``. |
@@ -256,6 +256,8 @@ the ``menu`` governor by default and if it is not tickless, the default | |||
256 | ``CPUIdle`` governor on it will be ``ladder``. | 256 | ``CPUIdle`` governor on it will be ``ladder``. |
257 | 257 | ||
258 | 258 | ||
259 | .. _menu-gov: | ||
260 | |||
259 | The ``menu`` Governor | 261 | The ``menu`` Governor |
260 | ===================== | 262 | ===================== |
261 | 263 | ||
@@ -333,6 +335,92 @@ that time, the governor may need to select a shallower state with a suitable | |||
333 | target residency. | 335 | target residency. |
334 | 336 | ||
335 | 337 | ||
338 | .. _teo-gov: | ||
339 | |||
340 | The Timer Events Oriented (TEO) Governor | ||
341 | ======================================== | ||
342 | |||
343 | The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor | ||
344 | for tickless systems. It follows the same basic strategy as the ``menu`` `one | ||
345 | <menu-gov_>`_: it always tries to find the deepest idle state suitable for the | ||
346 | given conditions. However, it applies a different approach to that problem. | ||
347 | |||
348 | First, it does not use sleep length correction factors, but instead it attempts | ||
349 | to correlate the observed idle duration values with the available idle states | ||
350 | and use that information to pick up the idle state that is most likely to | ||
351 | "match" the upcoming CPU idle interval. Second, it does not take the tasks | ||
352 | that were running on the given CPU in the past and are waiting on some I/O | ||
353 | operations to complete now into account at all (there is no guarantee that they will run on | ||
354 | the same CPU when they become runnable again) and the pattern detection code in | ||
355 | it avoids taking timer wakeups into account. It also only uses idle duration | ||
356 | values less than the current time till the closest timer (with the scheduler | ||
357 | tick excluded) for that purpose. | ||
358 | |||
359 | Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain | ||
360 | the *sleep length*, which is the time until the closest timer event with the | ||
361 | assumption that the scheduler tick will be stopped (that also is the upper bound | ||
362 | on the time until the next CPU wakeup). That value is then used to preselect an | ||
363 | idle state on the basis of three metrics maintained for each idle state provided | ||
364 | by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``. | ||
365 | |||
366 | The ``hits`` and ``misses`` metrics measure the likelihood that a given idle | ||
367 | state will "match" the observed (post-wakeup) idle duration if it "matches" the | ||
368 | sleep length. They both are subject to decay (after a CPU wakeup) every time | ||
369 | the target residency of the idle state corresponding to them is less than or | ||
370 | equal to the sleep length and the target residency of the next idle state is | ||
371 | greater than the sleep length (that is, when the idle state corresponding to | ||
372 | them "matches" the sleep length). The ``hits`` metric is increased if the | ||
373 | former condition is satisfied and the target residency of the given idle state | ||
374 | is less than or equal to the observed idle duration and the target residency of | ||
375 | the next idle state is greater than the observed idle duration at the same time | ||
376 | (that is, it is increased when the given idle state "matches" both the sleep | ||
377 | length and the observed idle duration). In turn, the ``misses`` metric is | ||
378 | increased when the given idle state "matches" the sleep length only and the | ||
379 | observed idle duration is too short for its target residency. | ||
380 | |||
381 | The ``early_hits`` metric measures the likelihood that a given idle state will | ||
382 | "match" the observed (post-wakeup) idle duration if it does not "match" the | ||
383 | sleep length. It is subject to decay on every CPU wakeup and it is increased | ||
384 | when the idle state corresponding to it "matches" the observed (post-wakeup) | ||
385 | idle duration and the target residency of the next idle state is less than or | ||
386 | equal to the sleep length (i.e. the idle state "matching" the sleep length is | ||
387 | deeper than the given one). | ||
388 | |||
389 | The governor walks the list of idle states provided by the ``CPUIdle`` driver | ||
390 | and finds the last (deepest) one with the target residency less than or equal | ||
391 | to the sleep length. Then, the ``hits`` and ``misses`` metrics of that idle | ||
392 | state are compared with each other and it is preselected if the ``hits`` one is | ||
393 | greater (which means that that idle state is likely to "match" the observed idle | ||
394 | duration after CPU wakeup). If the ``misses`` one is greater, the governor | ||
395 | preselects the shallower idle state with the maximum ``early_hits`` metric | ||
396 | (or if there are multiple shallower idle states with equal ``early_hits`` | ||
397 | metric which also is the maximum, the shallowest of them will be preselected). | ||
398 | [If there is a wakeup latency constraint coming from the `PM QoS framework | ||
399 | <cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the | ||
400 | target residency within the sleep length, the deepest idle state with the exit | ||
401 | latency within the constraint is preselected without consulting the ``hits``, | ||
402 | ``misses`` and ``early_hits`` metrics.] | ||
403 | |||
404 | Next, the governor takes several idle duration values observed most recently | ||
405 | into consideration and if at least a half of them are greater than or equal to | ||
406 | the target residency of the preselected idle state, that idle state becomes the | ||
407 | final candidate to ask for. Otherwise, the average of the most recent idle | ||
408 | duration values below the target residency of the preselected idle state is | ||
409 | computed and the governor walks the idle states shallower than the preselected | ||
410 | one and finds the deepest of them with the target residency within that average. | ||
411 | That idle state is then taken as the final candidate to ask for. | ||
412 | |||
413 | Still, at this point the governor may need to refine the idle state selection if | ||
414 | it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That | ||
415 | generally happens if the target residency of the idle state selected so far is | ||
416 | less than the tick period and the tick has not been stopped already (in a | ||
417 | previous iteration of the idle loop). Then, like in the ``menu`` governor | ||
418 | `case <menu-gov_>`_, the sleep length used in the previous computations may not | ||
419 | reflect the real time until the closest timer event and if it really is greater | ||
420 | than that time, a shallower state with a suitable target residency may need to | ||
421 | be selected. | ||
422 | |||
423 | |||
336 | .. _idle-states-representation: | 424 | .. _idle-states-representation: |
337 | 425 | ||
338 | Representation of Idle States | 426 | Representation of Idle States |
diff --git a/Documentation/cpuidle/driver.txt b/Documentation/cpuidle/driver.txt deleted file mode 100644 index 1b0d81d92583..000000000000 --- a/Documentation/cpuidle/driver.txt +++ /dev/null | |||
@@ -1,37 +0,0 @@ | |||
1 | |||
2 | |||
3 | Supporting multiple CPU idle levels in kernel | ||
4 | |||
5 | cpuidle drivers | ||
6 | |||
7 | |||
8 | |||
9 | |||
10 | cpuidle driver hooks into the cpuidle infrastructure and handles the | ||
11 | architecture/platform dependent part of CPU idle states. Driver | ||
12 | provides the platform idle state detection capability and also | ||
13 | has mechanisms in place to support actual entry-exit into CPU idle states. | ||
14 | |||
15 | cpuidle driver initializes the cpuidle_device structure for each CPU device | ||
16 | and registers with cpuidle using cpuidle_register_device. | ||
17 | |||
18 | If all the idle states are the same, the wrapper function cpuidle_register | ||
19 | could be used instead. | ||
20 | |||
21 | It can also support the dynamic changes (like battery <-> AC), by using | ||
22 | cpuidle_pause_and_lock, cpuidle_disable_device and cpuidle_enable_device, | ||
23 | cpuidle_resume_and_unlock. | ||
24 | |||
25 | Interfaces: | ||
26 | extern int cpuidle_register(struct cpuidle_driver *drv, | ||
27 | const struct cpumask *const coupled_cpus); | ||
28 | extern int cpuidle_unregister(struct cpuidle_driver *drv); | ||
29 | extern int cpuidle_register_driver(struct cpuidle_driver *drv); | ||
30 | extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); | ||
31 | extern int cpuidle_register_device(struct cpuidle_device *dev); | ||
32 | extern void cpuidle_unregister_device(struct cpuidle_device *dev); | ||
33 | |||
34 | extern void cpuidle_pause_and_lock(void); | ||
35 | extern void cpuidle_resume_and_unlock(void); | ||
36 | extern int cpuidle_enable_device(struct cpuidle_device *dev); | ||
37 | extern void cpuidle_disable_device(struct cpuidle_device *dev); | ||
diff --git a/Documentation/cpuidle/governor.txt b/Documentation/cpuidle/governor.txt deleted file mode 100644 index d9020f5e847b..000000000000 --- a/Documentation/cpuidle/governor.txt +++ /dev/null | |||
@@ -1,28 +0,0 @@ | |||
1 | |||
2 | |||
3 | |||
4 | Supporting multiple CPU idle levels in kernel | ||
5 | |||
6 | cpuidle governors | ||
7 | |||
8 | |||
9 | |||
10 | |||
11 | cpuidle governor is policy routine that decides what idle state to enter at | ||
12 | any given time. cpuidle core uses different callbacks to the governor. | ||
13 | |||
14 | * enable() to enable governor for a particular device | ||
15 | * disable() to disable governor for a particular device | ||
16 | * select() to select an idle state to enter | ||
17 | * reflect() called after returning from the idle state, which can be used | ||
18 | by the governor for some record keeping. | ||
19 | |||
20 | More than one governor can be registered at the same time and | ||
21 | users can switch between drivers using /sysfs interface (when enabled). | ||
22 | More than one governor part is supported for developers to easily experiment | ||
23 | with different governors. By default, most optimal governor based on your | ||
24 | kernel configuration and platform will be selected by cpuidle. | ||
25 | |||
26 | Interfaces: | ||
27 | extern int cpuidle_register_governor(struct cpuidle_governor *gov); | ||
28 | struct cpuidle_governor | ||
diff --git a/Documentation/driver-api/pm/cpuidle.rst b/Documentation/driver-api/pm/cpuidle.rst new file mode 100644 index 000000000000..5842ab621a58 --- /dev/null +++ b/Documentation/driver-api/pm/cpuidle.rst | |||
@@ -0,0 +1,282 @@ | |||
1 | .. |struct cpuidle_governor| replace:: :c:type:`struct cpuidle_governor <cpuidle_governor>` | ||
2 | .. |struct cpuidle_device| replace:: :c:type:`struct cpuidle_device <cpuidle_device>` | ||
3 | .. |struct cpuidle_driver| replace:: :c:type:`struct cpuidle_driver <cpuidle_driver>` | ||
4 | .. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>` | ||
5 | |||
6 | ======================== | ||
7 | CPU Idle Time Management | ||
8 | ======================== | ||
9 | |||
10 | :: | ||
11 | |||
12 | Copyright (c) 2019 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com> | ||
13 | |||
14 | |||
15 | CPU Idle Time Management Subsystem | ||
16 | ================================== | ||
17 | |||
18 | Every time one of the logical CPUs in the system (the entities that appear to | ||
19 | fetch and execute instructions: hardware threads, if present, or processor | ||
20 | cores) is idle after an interrupt or equivalent wakeup event, which means that | ||
21 | there are no tasks to run on it except for the special "idle" task associated | ||
22 | with it, there is an opportunity to save energy for the processor that it | ||
23 | belongs to. That can be done by making the idle logical CPU stop fetching | ||
24 | instructions from memory and putting some of the processor's functional units | ||
25 | depended on by it into an idle state in which they will draw less power. | ||
26 | |||
27 | However, there may be multiple different idle states that can be used in such a | ||
28 | situation in principle, so it may be necessary to find the most suitable one | ||
29 | (from the kernel perspective) and ask the processor to use (or "enter") that | ||
30 | particular idle state. That is the role of the CPU idle time management | ||
31 | subsystem in the kernel, called ``CPUIdle``. | ||
32 | |||
33 | The design of ``CPUIdle`` is modular and based on the code duplication avoidance | ||
34 | principle, so the generic code that in principle need not depend on the hardware | ||
35 | or platform design details in it is separate from the code that interacts with | ||
36 | the hardware. It generally is divided into three categories of functional | ||
37 | units: *governors* responsible for selecting idle states to ask the processor | ||
38 | to enter, *drivers* that pass the governors' decisions on to the hardware and | ||
39 | the *core* providing a common framework for them. | ||
40 | |||
41 | |||
42 | CPU Idle Time Governors | ||
43 | ======================= | ||
44 | |||
45 | A CPU idle time (``CPUIdle``) governor is a bundle of policy code invoked when | ||
46 | one of the logical CPUs in the system turns out to be idle. Its role is to | ||
47 | select an idle state to ask the processor to enter in order to save some energy. | ||
48 | |||
49 | ``CPUIdle`` governors are generic and each of them can be used on any hardware | ||
50 | platform that the Linux kernel can run on. For this reason, data structures | ||
51 | operated on by them cannot depend on any hardware architecture or platform | ||
52 | design details as well. | ||
53 | |||
54 | The governor itself is represented by a |struct cpuidle_governor| object | ||
55 | containing four callback pointers, :c:member:`enable`, :c:member:`disable`, | ||
56 | :c:member:`select`, :c:member:`reflect`, a :c:member:`rating` field described | ||
57 | below, and a name (string) used for identifying it. | ||
58 | |||
59 | For the governor to be available at all, that object needs to be registered | ||
60 | with the ``CPUIdle`` core by calling :c:func:`cpuidle_register_governor()` with | ||
61 | a pointer to it passed as the argument. If successful, that causes the core to | ||
62 | add the governor to the global list of available governors and, if it is the | ||
63 | only one in the list (that is, the list was empty before) or the value of its | ||
64 | :c:member:`rating` field is greater than the value of that field for the | ||
65 | governor currently in use, or the name of the new governor was passed to the | ||
66 | kernel as the value of the ``cpuidle.governor=`` command line parameter, the new | ||
67 | governor will be used from that point on (there can be only one ``CPUIdle`` | ||
68 | governor in use at a time). Also, if ``cpuidle_sysfs_switch`` is passed to the | ||
69 | kernel in the command line, user space can choose the ``CPUIdle`` governor to | ||
70 | use at run time via ``sysfs``. | ||
71 | |||
72 | Once registered, ``CPUIdle`` governors cannot be unregistered, so it is not | ||
73 | practical to put them into loadable kernel modules. | ||
74 | |||
75 | The interface between ``CPUIdle`` governors and the core consists of four | ||
76 | callbacks: | ||
77 | |||
78 | :c:member:`enable` | ||
79 | :: | ||
80 | |||
81 | int (*enable) (struct cpuidle_driver *drv, struct cpuidle_device *dev); | ||
82 | |||
83 | The role of this callback is to prepare the governor for handling the | ||
84 | (logical) CPU represented by the |struct cpuidle_device| object pointed | ||
85 | to by the ``dev`` argument. The |struct cpuidle_driver| object pointed | ||
86 | to by the ``drv`` argument represents the ``CPUIdle`` driver to be used | ||
87 | with that CPU (among other things, it should contain the list of | ||
88 | |struct cpuidle_state| objects representing idle states that the | ||
89 | processor holding the given CPU can be asked to enter). | ||
90 | |||
91 | It may fail, in which case it is expected to return a negative error | ||
92 | code, and that causes the kernel to run the architecture-specific | ||
93 | default code for idle CPUs on the CPU in question instead of ``CPUIdle`` | ||
94 | until the ``->enable()`` governor callback is invoked for that CPU | ||
95 | again. | ||
96 | |||
97 | :c:member:`disable` | ||
98 | :: | ||
99 | |||
100 | void (*disable) (struct cpuidle_driver *drv, struct cpuidle_device *dev); | ||
101 | |||
102 | Called to make the governor stop handling the (logical) CPU represented | ||
103 | by the |struct cpuidle_device| object pointed to by the ``dev`` | ||
104 | argument. | ||
105 | |||
106 | It is expected to reverse any changes made by the ``->enable()`` | ||
107 | callback when it was last invoked for the target CPU, free all memory | ||
108 | allocated by that callback and so on. | ||
109 | |||
110 | :c:member:`select` | ||
111 | :: | ||
112 | |||
113 | int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev, | ||
114 | bool *stop_tick); | ||
115 | |||
116 | Called to select an idle state for the processor holding the (logical) | ||
117 | CPU represented by the |struct cpuidle_device| object pointed to by the | ||
118 | ``dev`` argument. | ||
119 | |||
120 | The list of idle states to take into consideration is represented by the | ||
121 | :c:member:`states` array of |struct cpuidle_state| objects held by the | ||
122 | |struct cpuidle_driver| object pointed to by the ``drv`` argument (which | ||
123 | represents the ``CPUIdle`` driver to be used with the CPU at hand). The | ||
124 | value returned by this callback is interpreted as an index into that | ||
125 | array (unless it is a negative error code). | ||
126 | |||
127 | The ``stop_tick`` argument is used to indicate whether or not to stop | ||
128 | the scheduler tick before asking the processor to enter the selected | ||
129 | idle state. When the ``bool`` variable pointed to by it (which is set | ||
130 | to ``true`` before invoking this callback) is cleared to ``false``, the | ||
131 | processor will be asked to enter the selected idle state without | ||
132 | stopping the scheduler tick on the given CPU (if the tick has been | ||
133 | stopped on that CPU already, however, it will not be restarted before | ||
134 | asking the processor to enter the idle state). | ||
135 | |||
136 | This callback is mandatory (i.e. the :c:member:`select` callback pointer | ||
137 | in |struct cpuidle_governor| must not be ``NULL`` for the registration | ||
138 | of the governor to succeed). | ||
139 | |||
140 | :c:member:`reflect` | ||
141 | :: | ||
142 | |||
143 | void (*reflect) (struct cpuidle_device *dev, int index); | ||
144 | |||
145 | Called to allow the governor to evaluate the accuracy of the idle state | ||
146 | selection made by the ``->select()`` callback (when it was invoked last | ||
147 | time) and possibly use the result of that to improve the accuracy of | ||
148 | idle state selections in the future. | ||
149 | |||
150 | In addition, ``CPUIdle`` governors are required to take power management | ||
151 | quality of service (PM QoS) constraints on the processor wakeup latency into | ||
152 | account when selecting idle states. In order to obtain the current effective | ||
153 | PM QoS wakeup latency constraint for a given CPU, a ``CPUIdle`` governor is | ||
154 | expected to pass the number of the CPU to | ||
155 | :c:func:`cpuidle_governor_latency_req()`. Then, the governor's ``->select()`` | ||
156 | callback must not return the index of an idle state whose | ||
157 | :c:member:`exit_latency` value is greater than the number returned by that | ||
158 | function. | ||
159 | |||
160 | |||
161 | CPU Idle Time Management Drivers | ||
162 | ================================ | ||
163 | |||
164 | CPU idle time management (``CPUIdle``) drivers provide an interface between the | ||
165 | other parts of ``CPUIdle`` and the hardware. | ||
166 | |||
167 | First of all, a ``CPUIdle`` driver has to populate the :c:member:`states` array | ||
168 | of |struct cpuidle_state| objects included in the |struct cpuidle_driver| object | ||
169 | representing it. Going forward this array will represent the list of available | ||
170 | idle states that the processor hardware can be asked to enter shared by all of | ||
171 | the logical CPUs handled by the given driver. | ||
172 | |||
173 | The entries in the :c:member:`states` array are expected to be sorted by the | ||
174 | value of the :c:member:`target_residency` field in |struct cpuidle_state| in | ||
175 | the ascending order (that is, index 0 should correspond to the idle state with | ||
176 | the minimum value of :c:member:`target_residency`). [Since the | ||
177 | :c:member:`target_residency` value is expected to reflect the "depth" of the | ||
178 | idle state represented by the |struct cpuidle_state| object holding it, this | ||
179 | sorting order should be the same as the ascending sorting order by the idle | ||
180 | state "depth".] | ||
181 | |||
182 | Three fields in |struct cpuidle_state| are used by the existing ``CPUIdle`` | ||
183 | governors for computations related to idle state selection: | ||
184 | |||
185 | :c:member:`target_residency` | ||
186 | Minimum time to spend in this idle state including the time needed to | ||
187 | enter it (which may be substantial) to save more energy than could | ||
188 | be saved by staying in a shallower idle state for the same amount of | ||
189 | time, in microseconds. | ||
190 | |||
191 | :c:member:`exit_latency` | ||
192 | Maximum time it will take a CPU asking the processor to enter this idle | ||
193 | state to start executing the first instruction after a wakeup from it, | ||
194 | in microseconds. | ||
195 | |||
196 | :c:member:`flags` | ||
197 | Flags representing idle state properties. Currently, governors only use | ||
198 | the ``CPUIDLE_FLAG_POLLING`` flag which is set if the given object | ||
199 | does not represent a real idle state, but an interface to a software | ||
200 | "loop" that can be used in order to avoid asking the processor to enter | ||
201 | any idle state at all. [There are other flags used by the ``CPUIdle`` | ||
202 | core in special situations.] | ||
203 | |||
204 | The :c:member:`enter` callback pointer in |struct cpuidle_state|, which must not | ||
205 | be ``NULL``, points to the routine to execute in order to ask the processor to | ||
206 | enter this particular idle state: | ||
207 | |||
208 | :: | ||
209 | |||
210 | void (*enter) (struct cpuidle_device *dev, struct cpuidle_driver *drv, | ||
211 | int index); | ||
212 | |||
213 | The first two arguments of it point to the |struct cpuidle_device| object | ||
214 | representing the logical CPU running this callback and the | ||
215 | |struct cpuidle_driver| object representing the driver itself, respectively, | ||
216 | and the last one is an index of the |struct cpuidle_state| entry in the driver's | ||
217 | :c:member:`states` array representing the idle state to ask the processor to | ||
218 | enter. | ||
219 | |||
220 | The analogous ``->enter_s2idle()`` callback in |struct cpuidle_state| is used | ||
221 | only for implementing the suspend-to-idle system-wide power management feature. | ||
222 | The difference between it and ``->enter()`` is that it must not re-enable | ||
223 | interrupts at any point (even temporarily) or attempt to change the states of | ||
224 | clock event devices, which the ``->enter()`` callback may do sometimes. | ||
225 | |||
226 | Once the :c:member:`states` array has been populated, the number of valid | ||
227 | entries in it has to be stored in the :c:member:`state_count` field of the | ||
228 | |struct cpuidle_driver| object representing the driver. Moreover, if any | ||
229 | entries in the :c:member:`states` array represent "coupled" idle states (that | ||
230 | is, idle states that can only be asked for if multiple related logical CPUs are | ||
231 | idle), the :c:member:`safe_state_index` field in |struct cpuidle_driver| needs | ||
232 | to be the index of an idle state that is not "coupled" (that is, one that can be | ||
233 | asked for if only one logical CPU is idle). | ||
234 | |||
235 | In addition to that, if the given ``CPUIdle`` driver is only going to handle a | ||
236 | subset of logical CPUs in the system, the :c:member:`cpumask` field in its | ||
237 | |struct cpuidle_driver| object must point to the set (mask) of CPUs that will be | ||
238 | handled by it. | ||
239 | |||
240 | A ``CPUIdle`` driver can only be used after it has been registered. If there | ||
241 | are no "coupled" idle state entries in the driver's :c:member:`states` array, | ||
242 | that can be accomplished by passing the driver's |struct cpuidle_driver| object | ||
243 | to :c:func:`cpuidle_register_driver()`. Otherwise, :c:func:`cpuidle_register()` | ||
244 | should be used for this purpose. | ||
245 | |||
246 | However, it also is necessary to register |struct cpuidle_device| objects for | ||
247 | all of the logical CPUs to be handled by the given ``CPUIdle`` driver with the | ||
248 | help of :c:func:`cpuidle_register_device()` after the driver has been registered | ||
249 | and :c:func:`cpuidle_register_driver()`, unlike :c:func:`cpuidle_register()`, | ||
250 | does not do that automatically. For this reason, the drivers that use | ||
251 | :c:func:`cpuidle_register_driver()` to register themselves must also take care | ||
252 | of registering the |struct cpuidle_device| objects as needed, so it is generally | ||
253 | recommended to use :c:func:`cpuidle_register()` for ``CPUIdle`` driver | ||
254 | registration in all cases. | ||
255 | |||
256 | The registration of a |struct cpuidle_device| object causes the ``CPUIdle`` | ||
257 | ``sysfs`` interface to be created and the governor's ``->enable()`` callback to | ||
258 | be invoked for the logical CPU represented by it, so it must take place after | ||
259 | registering the driver that will handle the CPU in question. | ||
260 | |||
261 | ``CPUIdle`` drivers and |struct cpuidle_device| objects can be unregistered | ||
262 | when they are not necessary any more which allows some resources associated with | ||
263 | them to be released. Due to dependencies between them, all of the | ||
264 | |struct cpuidle_device| objects representing CPUs handled by the given | ||
265 | ``CPUIdle`` driver must be unregistered, with the help of | ||
266 | :c:func:`cpuidle_unregister_device()`, before calling | ||
267 | :c:func:`cpuidle_unregister_driver()` to unregister the driver. Alternatively, | ||
268 | :c:func:`cpuidle_unregister()` can be called to unregister a ``CPUIdle`` driver | ||
269 | along with all of the |struct cpuidle_device| objects representing CPUs handled | ||
270 | by it. | ||
271 | |||
272 | ``CPUIdle`` drivers can respond to runtime system configuration changes that | ||
273 | lead to modifications of the list of available processor idle states (which can | ||
274 | happen, for example, when the system's power source is switched from AC to | ||
275 | battery or the other way around). Upon a notification of such a change, | ||
276 | a ``CPUIdle`` driver is expected to call :c:func:`cpuidle_pause_and_lock()` to | ||
277 | turn ``CPUIdle`` off temporarily and then :c:func:`cpuidle_disable_device()` for | ||
278 | all of the |struct cpuidle_device| objects representing CPUs affected by that | ||
279 | change. Next, it can update its :c:member:`states` array in accordance with | ||
280 | the new configuration of the system, call :c:func:`cpuidle_enable_device()` for | ||
281 | all of the relevant |struct cpuidle_device| objects and invoke | ||
282 | :c:func:`cpuidle_resume_and_unlock()` to allow ``CPUIdle`` to be used again. | ||
diff --git a/Documentation/driver-api/pm/index.rst b/Documentation/driver-api/pm/index.rst index 2f6d0e9cf6b7..56975c6bc789 100644 --- a/Documentation/driver-api/pm/index.rst +++ b/Documentation/driver-api/pm/index.rst | |||
@@ -1,9 +1,10 @@ | |||
1 | ======================= | 1 | =============================== |
2 | Device Power Management | 2 | CPU and Device Power Management |
3 | ======================= | 3 | =============================== |
4 | 4 | ||
5 | .. toctree:: | 5 | .. toctree:: |
6 | 6 | ||
7 | cpuidle | ||
7 | devices | 8 | devices |
8 | notifiers | 9 | notifiers |
9 | types | 10 | types |
diff --git a/MAINTAINERS b/MAINTAINERS index 9f64f8d3740e..b053a355894e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -4016,6 +4016,7 @@ S: Maintained | |||
4016 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git | 4016 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git |
4017 | B: https://bugzilla.kernel.org | 4017 | B: https://bugzilla.kernel.org |
4018 | F: Documentation/admin-guide/pm/cpuidle.rst | 4018 | F: Documentation/admin-guide/pm/cpuidle.rst |
4019 | F: Documentation/driver-api/pm/cpuidle.rst | ||
4019 | F: drivers/cpuidle/* | 4020 | F: drivers/cpuidle/* |
4020 | F: include/linux/cpuidle.h | 4021 | F: include/linux/cpuidle.h |
4021 | 4022 | ||
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 7e48eb5bf0a7..8caccbbd7353 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig | |||
@@ -4,7 +4,7 @@ config CPU_IDLE | |||
4 | bool "CPU idle PM support" | 4 | bool "CPU idle PM support" |
5 | default y if ACPI || PPC_PSERIES | 5 | default y if ACPI || PPC_PSERIES |
6 | select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) | 6 | select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) |
7 | select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) | 7 | select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO |
8 | help | 8 | help |
9 | CPU idle is a generic framework for supporting software-controlled | 9 | CPU idle is a generic framework for supporting software-controlled |
10 | idle processor power management. It includes modular cross-platform | 10 | idle processor power management. It includes modular cross-platform |
@@ -23,6 +23,15 @@ config CPU_IDLE_GOV_LADDER | |||
23 | config CPU_IDLE_GOV_MENU | 23 | config CPU_IDLE_GOV_MENU |
24 | bool "Menu governor (for tickless system)" | 24 | bool "Menu governor (for tickless system)" |
25 | 25 | ||
26 | config CPU_IDLE_GOV_TEO | ||
27 | bool "Timer events oriented (TEO) governor (for tickless systems)" | ||
28 | help | ||
29 | This governor implements a simplified idle state selection method | ||
30 | focused on timer events and does not do any interactivity boosting. | ||
31 | |||
32 | Some workloads benefit from using it and it generally should be safe | ||
33 | to use. Say Y here if you are not happy with the alternatives. | ||
34 | |||
26 | config DT_IDLE_STATES | 35 | config DT_IDLE_STATES |
27 | bool | 36 | bool |
28 | 37 | ||
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile index 1b512722689f..4d8aff5248a8 100644 --- a/drivers/cpuidle/governors/Makefile +++ b/drivers/cpuidle/governors/Makefile | |||
@@ -4,3 +4,4 @@ | |||
4 | 4 | ||
5 | obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o | 5 | obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o |
6 | obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o | 6 | obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o |
7 | obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o | ||
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c new file mode 100644 index 000000000000..7d05efdbd3c6 --- /dev/null +++ b/drivers/cpuidle/governors/teo.c | |||
@@ -0,0 +1,444 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Timer events oriented CPU idle governor | ||
4 | * | ||
5 | * Copyright (C) 2018 Intel Corporation | ||
6 | * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> | ||
7 | * | ||
8 | * The idea of this governor is based on the observation that on many systems | ||
9 | * timer events are two or more orders of magnitude more frequent than any | ||
10 | * other interrupts, so they are likely to be the most significant source of CPU | ||
11 | * wakeups from idle states. Moreover, information about what happened in the | ||
12 | * (relatively recent) past can be used to estimate whether or not the deepest | ||
13 | * idle state with target residency within the time to the closest timer is | ||
14 | * likely to be suitable for the upcoming idle time of the CPU and, if not, then | ||
15 | * which of the shallower idle states to choose. | ||
16 | * | ||
17 | * Of course, non-timer wakeup sources are more important in some use cases and | ||
18 | * they can be covered by taking a few most recent idle time intervals of the | ||
19 | * CPU into account. However, even in that case it is not necessary to consider | ||
20 | * idle duration values greater than the time till the closest timer, as the | ||
21 | * patterns that they may belong to produce average values close enough to | ||
22 | * the time till the closest timer (sleep length) anyway. | ||
23 | * | ||
24 | * Thus this governor estimates whether or not the upcoming idle time of the CPU | ||
25 | * is likely to be significantly shorter than the sleep length and selects an | ||
26 | * idle state for it in accordance with that, as follows: | ||
27 | * | ||
28 | * - Find an idle state on the basis of the sleep length and state statistics | ||
29 | * collected over time: | ||
30 | * | ||
31 | * o Find the deepest idle state whose target residency is less than or equal | ||
32 | * to the sleep length. | ||
33 | * | ||
34 | * o Select it if it matched both the sleep length and the observed idle | ||
35 | * duration in the past more often than it matched the sleep length alone | ||
36 | * (i.e. the observed idle duration was significantly shorter than the sleep | ||
37 | * length matched by it). | ||
38 | * | ||
39 | * o Otherwise, select the shallower state with the greatest matched "early" | ||
40 | * wakeups metric. | ||
41 | * | ||
42 | * - If the majority of the most recent idle duration values are below the | ||
43 | * target residency of the idle state selected so far, use those values to | ||
44 | * compute the new expected idle duration and find an idle state matching it | ||
45 | * (which has to be shallower than the one selected so far). | ||
46 | */ | ||
47 | |||
48 | #include <linux/cpuidle.h> | ||
49 | #include <linux/jiffies.h> | ||
50 | #include <linux/kernel.h> | ||
51 | #include <linux/sched/clock.h> | ||
52 | #include <linux/tick.h> | ||
53 | |||
54 | /* | ||
55 | * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value | ||
56 | * is used for decreasing metrics on a regular basis. | ||
57 | */ | ||
58 | #define PULSE 1024 | ||
59 | #define DECAY_SHIFT 3 | ||
60 | |||
61 | /* | ||
62 | * Number of the most recent idle duration values to take into consideration for | ||
63 | * the detection of wakeup patterns. | ||
64 | */ | ||
65 | #define INTERVALS 8 | ||
66 | |||
67 | /** | ||
68 | * struct teo_idle_state - Idle state data used by the TEO cpuidle governor. | ||
69 | * @early_hits: "Early" CPU wakeups "matching" this state. | ||
70 | * @hits: "On time" CPU wakeups "matching" this state. | ||
71 | * @misses: CPU wakeups "missing" this state. | ||
72 | * | ||
73 | * A CPU wakeup is "matched" by a given idle state if the idle duration measured | ||
74 | * after the wakeup is between the target residency of that state and the target | ||
75 | * residency of the next one (or if this is the deepest available idle state, it | ||
76 | * "matches" a CPU wakeup when the measured idle duration is at least equal to | ||
77 | * its target residency). | ||
78 | * | ||
79 | * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if | ||
80 | * it occurs significantly earlier than the closest expected timer event (that | ||
81 | * is, early enough to match an idle state shallower than the one matching the | ||
82 | * time till the closest timer event). Otherwise, the wakeup is "on time", or | ||
83 | * it is a "hit". | ||
84 | * | ||
85 | * A "miss" occurs when the given state doesn't match the wakeup, but it matches | ||
86 | * the time till the closest timer event used for idle state selection. | ||
87 | */ | ||
88 | struct teo_idle_state { | ||
89 | unsigned int early_hits; | ||
90 | unsigned int hits; | ||
91 | unsigned int misses; | ||
92 | }; | ||
93 | |||
94 | /** | ||
95 | * struct teo_cpu - CPU data used by the TEO cpuidle governor. | ||
96 | * @time_span_ns: Time between idle state selection and post-wakeup update. | ||
97 | * @sleep_length_ns: Time till the closest timer event (at the selection time). | ||
98 | * @states: Idle states data corresponding to this CPU. | ||
99 | * @last_state: Idle state entered by the CPU last time. | ||
100 | * @interval_idx: Index of the most recent saved idle interval. | ||
101 | * @intervals: Saved idle duration values. | ||
102 | */ | ||
103 | struct teo_cpu { | ||
104 | u64 time_span_ns; | ||
105 | u64 sleep_length_ns; | ||
106 | struct teo_idle_state states[CPUIDLE_STATE_MAX]; | ||
107 | int last_state; | ||
108 | int interval_idx; | ||
109 | unsigned int intervals[INTERVALS]; | ||
110 | }; | ||
111 | |||
112 | static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); | ||
113 | |||
114 | /** | ||
115 | * teo_update - Update CPU data after wakeup. | ||
116 | * @drv: cpuidle driver containing state data. | ||
117 | * @dev: Target CPU. | ||
118 | */ | ||
119 | static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) | ||
120 | { | ||
121 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
122 | unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); | ||
123 | int i, idx_hit = -1, idx_timer = -1; | ||
124 | unsigned int measured_us; | ||
125 | |||
126 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { | ||
127 | /* | ||
128 | * One of the safety nets has triggered or this was a timer | ||
129 | * wakeup (or equivalent). | ||
130 | */ | ||
131 | measured_us = sleep_length_us; | ||
132 | } else { | ||
133 | unsigned int lat = drv->states[cpu_data->last_state].exit_latency; | ||
134 | |||
135 | measured_us = ktime_to_us(cpu_data->time_span_ns); | ||
136 | /* | ||
137 | * The delay between the wakeup and the first instruction | ||
138 | * executed by the CPU is not likely to be worst-case every | ||
139 | * time, so take 1/2 of the exit latency as a very rough | ||
140 | * approximation of the average of it. | ||
141 | */ | ||
142 | if (measured_us >= lat) | ||
143 | measured_us -= lat / 2; | ||
144 | else | ||
145 | measured_us /= 2; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Decay the "early hits" metric for all of the states and find the | ||
150 | * states matching the sleep length and the measured idle duration. | ||
151 | */ | ||
152 | for (i = 0; i < drv->state_count; i++) { | ||
153 | unsigned int early_hits = cpu_data->states[i].early_hits; | ||
154 | |||
155 | cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; | ||
156 | |||
157 | if (drv->states[i].target_residency <= sleep_length_us) { | ||
158 | idx_timer = i; | ||
159 | if (drv->states[i].target_residency <= measured_us) | ||
160 | idx_hit = i; | ||
161 | } | ||
162 | } | ||
163 | |||
164 | /* | ||
165 | * Update the "hits" and "misses" data for the state matching the sleep | ||
166 | * length. If it matches the measured idle duration too, this is a hit, | ||
167 | * so increase the "hits" metric for it then. Otherwise, this is a | ||
168 | * miss, so increase the "misses" metric for it. In the latter case | ||
169 | * also increase the "early hits" metric for the state that actually | ||
170 | * matches the measured idle duration. | ||
171 | */ | ||
172 | if (idx_timer >= 0) { | ||
173 | unsigned int hits = cpu_data->states[idx_timer].hits; | ||
174 | unsigned int misses = cpu_data->states[idx_timer].misses; | ||
175 | |||
176 | hits -= hits >> DECAY_SHIFT; | ||
177 | misses -= misses >> DECAY_SHIFT; | ||
178 | |||
179 | if (idx_timer > idx_hit) { | ||
180 | misses += PULSE; | ||
181 | if (idx_hit >= 0) | ||
182 | cpu_data->states[idx_hit].early_hits += PULSE; | ||
183 | } else { | ||
184 | hits += PULSE; | ||
185 | } | ||
186 | |||
187 | cpu_data->states[idx_timer].misses = misses; | ||
188 | cpu_data->states[idx_timer].hits = hits; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * If the total time span between idle state selection and the "reflect" | ||
193 | * callback is greater than or equal to the sleep length determined at | ||
194 | * the idle state selection time, the wakeup is likely to be due to a | ||
195 | * timer event. | ||
196 | */ | ||
197 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) | ||
198 | measured_us = UINT_MAX; | ||
199 | |||
200 | /* | ||
201 | * Save idle duration values corresponding to non-timer wakeups for | ||
202 | * pattern detection. | ||
203 | */ | ||
204 | cpu_data->intervals[cpu_data->interval_idx++] = measured_us; | ||
205 | if (cpu_data->interval_idx > INTERVALS) | ||
206 | cpu_data->interval_idx = 0; | ||
207 | } | ||
208 | |||
209 | /** | ||
210 | * teo_find_shallower_state - Find shallower idle state matching given duration. | ||
211 | * @drv: cpuidle driver containing state data. | ||
212 | * @dev: Target CPU. | ||
213 | * @state_idx: Index of the capping idle state. | ||
214 | * @duration_us: Idle duration value to match. | ||
215 | */ | ||
216 | static int teo_find_shallower_state(struct cpuidle_driver *drv, | ||
217 | struct cpuidle_device *dev, int state_idx, | ||
218 | unsigned int duration_us) | ||
219 | { | ||
220 | int i; | ||
221 | |||
222 | for (i = state_idx - 1; i >= 0; i--) { | ||
223 | if (drv->states[i].disabled || dev->states_usage[i].disable) | ||
224 | continue; | ||
225 | |||
226 | state_idx = i; | ||
227 | if (drv->states[i].target_residency <= duration_us) | ||
228 | break; | ||
229 | } | ||
230 | return state_idx; | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * teo_select - Selects the next idle state to enter. | ||
235 | * @drv: cpuidle driver containing state data. | ||
236 | * @dev: Target CPU. | ||
237 | * @stop_tick: Indication on whether or not to stop the scheduler tick. | ||
238 | */ | ||
239 | static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, | ||
240 | bool *stop_tick) | ||
241 | { | ||
242 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
243 | int latency_req = cpuidle_governor_latency_req(dev->cpu); | ||
244 | unsigned int duration_us, count; | ||
245 | int max_early_idx, idx, i; | ||
246 | ktime_t delta_tick; | ||
247 | |||
248 | if (cpu_data->last_state >= 0) { | ||
249 | teo_update(drv, dev); | ||
250 | cpu_data->last_state = -1; | ||
251 | } | ||
252 | |||
253 | cpu_data->time_span_ns = local_clock(); | ||
254 | |||
255 | cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); | ||
256 | duration_us = ktime_to_us(cpu_data->sleep_length_ns); | ||
257 | |||
258 | count = 0; | ||
259 | max_early_idx = -1; | ||
260 | idx = -1; | ||
261 | |||
262 | for (i = 0; i < drv->state_count; i++) { | ||
263 | struct cpuidle_state *s = &drv->states[i]; | ||
264 | struct cpuidle_state_usage *su = &dev->states_usage[i]; | ||
265 | |||
266 | if (s->disabled || su->disable) { | ||
267 | /* | ||
268 | * If the "early hits" metric of a disabled state is | ||
269 | * greater than the current maximum, it should be taken | ||
270 | * into account, because it would be a mistake to select | ||
271 | * a deeper state with lower "early hits" metric. The | ||
272 | * index cannot be changed to point to it, however, so | ||
273 | * just increase the max count alone and let the index | ||
274 | * still point to a shallower idle state. | ||
275 | */ | ||
276 | if (max_early_idx >= 0 && | ||
277 | count < cpu_data->states[i].early_hits) | ||
278 | count = cpu_data->states[i].early_hits; | ||
279 | |||
280 | continue; | ||
281 | } | ||
282 | |||
283 | if (idx < 0) | ||
284 | idx = i; /* first enabled state */ | ||
285 | |||
286 | if (s->target_residency > duration_us) | ||
287 | break; | ||
288 | |||
289 | if (s->exit_latency > latency_req) { | ||
290 | /* | ||
291 | * If we break out of the loop for latency reasons, use | ||
292 | * the target residency of the selected state as the | ||
293 | * expected idle duration to avoid stopping the tick | ||
294 | * as long as that target residency is low enough. | ||
295 | */ | ||
296 | duration_us = drv->states[idx].target_residency; | ||
297 | goto refine; | ||
298 | } | ||
299 | |||
300 | idx = i; | ||
301 | |||
302 | if (count < cpu_data->states[i].early_hits && | ||
303 | !(tick_nohz_tick_stopped() && | ||
304 | drv->states[i].target_residency < TICK_USEC)) { | ||
305 | count = cpu_data->states[i].early_hits; | ||
306 | max_early_idx = i; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * If the "hits" metric of the idle state matching the sleep length is | ||
312 | * greater than its "misses" metric, that is the one to use. Otherwise, | ||
313 | * it is more likely that one of the shallower states will match the | ||
314 | * idle duration observed after wakeup, so take the one with the maximum | ||
315 | * "early hits" metric, but if that cannot be determined, just use the | ||
316 | * state selected so far. | ||
317 | */ | ||
318 | if (cpu_data->states[idx].hits <= cpu_data->states[idx].misses && | ||
319 | max_early_idx >= 0) { | ||
320 | idx = max_early_idx; | ||
321 | duration_us = drv->states[idx].target_residency; | ||
322 | } | ||
323 | |||
324 | refine: | ||
325 | if (idx < 0) { | ||
326 | idx = 0; /* No states enabled. Must use 0. */ | ||
327 | } else if (idx > 0) { | ||
328 | u64 sum = 0; | ||
329 | |||
330 | count = 0; | ||
331 | |||
332 | /* | ||
333 | * Count and sum the most recent idle duration values less than | ||
334 | * the target residency of the state selected so far, find the | ||
335 | * max. | ||
336 | */ | ||
337 | for (i = 0; i < INTERVALS; i++) { | ||
338 | unsigned int val = cpu_data->intervals[i]; | ||
339 | |||
340 | if (val >= drv->states[idx].target_residency) | ||
341 | continue; | ||
342 | |||
343 | count++; | ||
344 | sum += val; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * Give up unless the majority of the most recent idle duration | ||
349 | * values are in the interesting range. | ||
350 | */ | ||
351 | if (count > INTERVALS / 2) { | ||
352 | unsigned int avg_us = div64_u64(sum, count); | ||
353 | |||
354 | /* | ||
355 | * Avoid spending too much time in an idle state that | ||
356 | * would be too shallow. | ||
357 | */ | ||
358 | if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { | ||
359 | idx = teo_find_shallower_state(drv, dev, idx, avg_us); | ||
360 | duration_us = avg_us; | ||
361 | } | ||
362 | } | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | * Don't stop the tick if the selected state is a polling one or if the | ||
367 | * expected idle duration is shorter than the tick period length. | ||
368 | */ | ||
369 | if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || | ||
370 | duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { | ||
371 | unsigned int delta_tick_us = ktime_to_us(delta_tick); | ||
372 | |||
373 | *stop_tick = false; | ||
374 | |||
375 | /* | ||
376 | * The tick is not going to be stopped, so if the target | ||
377 | * residency of the state to be returned is not within the time | ||
378 | * till the closest timer including the tick, try to correct | ||
379 | * that. | ||
380 | */ | ||
381 | if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) | ||
382 | idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); | ||
383 | } | ||
384 | |||
385 | return idx; | ||
386 | } | ||
387 | |||
388 | /** | ||
389 | * teo_reflect - Note that governor data for the CPU need to be updated. | ||
390 | * @dev: Target CPU. | ||
391 | * @state: Entered state. | ||
392 | */ | ||
393 | static void teo_reflect(struct cpuidle_device *dev, int state) | ||
394 | { | ||
395 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
396 | |||
397 | cpu_data->last_state = state; | ||
398 | /* | ||
399 | * If the wakeup was not "natural", but triggered by one of the safety | ||
400 | * nets, assume that the CPU might have been idle for the entire sleep | ||
401 | * length time. | ||
402 | */ | ||
403 | if (dev->poll_time_limit || | ||
404 | (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { | ||
405 | dev->poll_time_limit = false; | ||
406 | cpu_data->time_span_ns = cpu_data->sleep_length_ns; | ||
407 | } else { | ||
408 | cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /** | ||
413 | * teo_enable_device - Initialize the governor's data for the target CPU. | ||
414 | * @drv: cpuidle driver (not used). | ||
415 | * @dev: Target CPU. | ||
416 | */ | ||
417 | static int teo_enable_device(struct cpuidle_driver *drv, | ||
418 | struct cpuidle_device *dev) | ||
419 | { | ||
420 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
421 | int i; | ||
422 | |||
423 | memset(cpu_data, 0, sizeof(*cpu_data)); | ||
424 | |||
425 | for (i = 0; i < INTERVALS; i++) | ||
426 | cpu_data->intervals[i] = UINT_MAX; | ||
427 | |||
428 | return 0; | ||
429 | } | ||
430 | |||
431 | static struct cpuidle_governor teo_governor = { | ||
432 | .name = "teo", | ||
433 | .rating = 19, | ||
434 | .enable = teo_enable_device, | ||
435 | .select = teo_select, | ||
436 | .reflect = teo_reflect, | ||
437 | }; | ||
438 | |||
439 | static int __init teo_governor_init(void) | ||
440 | { | ||
441 | return cpuidle_register_governor(&teo_governor); | ||
442 | } | ||
443 | |||
444 | postcore_initcall(teo_governor_init); | ||
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4dff74f48d4b..3b39472324a3 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h | |||
@@ -69,11 +69,9 @@ struct cpuidle_state { | |||
69 | 69 | ||
70 | /* Idle State Flags */ | 70 | /* Idle State Flags */ |
71 | #define CPUIDLE_FLAG_NONE (0x00) | 71 | #define CPUIDLE_FLAG_NONE (0x00) |
72 | #define CPUIDLE_FLAG_POLLING (0x01) /* polling state */ | 72 | #define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ |
73 | #define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ | 73 | #define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ |
74 | #define CPUIDLE_FLAG_TIMER_STOP (0x04) /* timer is stopped on this state */ | 74 | #define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ |
75 | |||
76 | #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) | ||
77 | 75 | ||
78 | struct cpuidle_device_kobj; | 76 | struct cpuidle_device_kobj; |
79 | struct cpuidle_state_kobj; | 77 | struct cpuidle_state_kobj; |