diff options
| -rw-r--r-- | Documentation/admin-guide/pm/cpuidle.rst | 104 | ||||
| -rw-r--r-- | Documentation/cpuidle/driver.txt | 37 | ||||
| -rw-r--r-- | Documentation/cpuidle/governor.txt | 28 | ||||
| -rw-r--r-- | Documentation/driver-api/pm/cpuidle.rst | 282 | ||||
| -rw-r--r-- | Documentation/driver-api/pm/index.rst | 7 | ||||
| -rw-r--r-- | MAINTAINERS | 1 | ||||
| -rw-r--r-- | drivers/cpuidle/Kconfig | 11 | ||||
| -rw-r--r-- | drivers/cpuidle/governors/Makefile | 1 | ||||
| -rw-r--r-- | drivers/cpuidle/governors/teo.c | 444 | ||||
| -rw-r--r-- | include/linux/cpuidle.h | 8 |
10 files changed, 841 insertions, 82 deletions
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 106379e2619f..9c58b35a81cb 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst | |||
| @@ -155,14 +155,14 @@ governor uses that information depends on what algorithm is implemented by it | |||
| 155 | and that is the primary reason for having more than one governor in the | 155 | and that is the primary reason for having more than one governor in the |
| 156 | ``CPUIdle`` subsystem. | 156 | ``CPUIdle`` subsystem. |
| 157 | 157 | ||
| 158 | There are two ``CPUIdle`` governors available, ``menu`` and ``ladder``. Which | 158 | There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_ |
| 159 | of them is used depends on the configuration of the kernel and in particular on | 159 | and ``ladder``. Which of them is used by default depends on the configuration |
| 160 | whether or not the scheduler tick can be `stopped by the idle | 160 | of the kernel and in particular on whether or not the scheduler tick can be |
| 161 | loop <idle-cpus-and-tick_>`_. It is possible to change the governor at run time | 161 | `stopped by the idle loop <idle-cpus-and-tick_>`_. It is possible to change the |
| 162 | if the ``cpuidle_sysfs_switch`` command line parameter has been passed to the | 162 | governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has |
| 163 | kernel, but that is not safe in general, so it should not be done on production | 163 | been passed to the kernel, but that is not safe in general, so it should not be |
| 164 | systems (that may change in the future, though). The name of the ``CPUIdle`` | 164 | done on production systems (that may change in the future, though). The name of |
| 165 | governor currently used by the kernel can be read from the | 165 | the ``CPUIdle`` governor currently used by the kernel can be read from the |
| 166 | :file:`current_governor_ro` (or :file:`current_governor` if | 166 | :file:`current_governor_ro` (or :file:`current_governor` if |
| 167 | ``cpuidle_sysfs_switch`` is present in the kernel command line) file under | 167 | ``cpuidle_sysfs_switch`` is present in the kernel command line) file under |
| 168 | :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``. | 168 | :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``. |
| @@ -256,6 +256,8 @@ the ``menu`` governor by default and if it is not tickless, the default | |||
| 256 | ``CPUIdle`` governor on it will be ``ladder``. | 256 | ``CPUIdle`` governor on it will be ``ladder``. |
| 257 | 257 | ||
| 258 | 258 | ||
| 259 | .. _menu-gov: | ||
| 260 | |||
| 259 | The ``menu`` Governor | 261 | The ``menu`` Governor |
| 260 | ===================== | 262 | ===================== |
| 261 | 263 | ||
| @@ -333,6 +335,92 @@ that time, the governor may need to select a shallower state with a suitable | |||
| 333 | target residency. | 335 | target residency. |
| 334 | 336 | ||
| 335 | 337 | ||
| 338 | .. _teo-gov: | ||
| 339 | |||
| 340 | The Timer Events Oriented (TEO) Governor | ||
| 341 | ======================================== | ||
| 342 | |||
| 343 | The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor | ||
| 344 | for tickless systems. It follows the same basic strategy as the ``menu`` `one | ||
| 345 | <menu-gov_>`_: it always tries to find the deepest idle state suitable for the | ||
| 346 | given conditions. However, it applies a different approach to that problem. | ||
| 347 | |||
| 348 | First, it does not use sleep length correction factors, but instead it attempts | ||
| 349 | to correlate the observed idle duration values with the available idle states | ||
| 350 | and use that information to pick up the idle state that is most likely to | ||
| 351 | "match" the upcoming CPU idle interval. Second, it does not take the tasks | ||
| 352 | that were running on the given CPU in the past and are waiting on some I/O | ||
| 353 | operations to complete now at all (there is no guarantee that they will run on | ||
| 354 | the same CPU when they become runnable again) and the pattern detection code in | ||
| 355 | it avoids taking timer wakeups into account. It also only uses idle duration | ||
| 356 | values less than the current time till the closest timer (with the scheduler | ||
| 357 | tick excluded) for that purpose. | ||
| 358 | |||
| 359 | Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain | ||
| 360 | the *sleep length*, which is the time until the closest timer event with the | ||
| 361 | assumption that the scheduler tick will be stopped (that also is the upper bound | ||
| 362 | on the time until the next CPU wakeup). That value is then used to preselect an | ||
| 363 | idle state on the basis of three metrics maintained for each idle state provided | ||
| 364 | by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``. | ||
| 365 | |||
| 366 | The ``hits`` and ``misses`` metrics measure the likelihood that a given idle | ||
| 367 | state will "match" the observed (post-wakeup) idle duration if it "matches" the | ||
| 368 | sleep length. They both are subject to decay (after a CPU wakeup) every time | ||
| 369 | the target residency of the idle state corresponding to them is less than or | ||
| 370 | equal to the sleep length and the target residency of the next idle state is | ||
| 371 | greater than the sleep length (that is, when the idle state corresponding to | ||
| 372 | them "matches" the sleep length). The ``hits`` metric is increased if the | ||
| 373 | former condition is satisfied and the target residency of the given idle state | ||
| 374 | is less than or equal to the observed idle duration and the target residency of | ||
| 375 | the next idle state is greater than the observed idle duration at the same time | ||
| 376 | (that is, it is increased when the given idle state "matches" both the sleep | ||
| 377 | length and the observed idle duration). In turn, the ``misses`` metric is | ||
| 378 | increased when the given idle state "matches" the sleep length only and the | ||
| 379 | observed idle duration is too short for its target residency. | ||
| 380 | |||
| 381 | The ``early_hits`` metric measures the likelihood that a given idle state will | ||
| 382 | "match" the observed (post-wakeup) idle duration if it does not "match" the | ||
| 383 | sleep length. It is subject to decay on every CPU wakeup and it is increased | ||
| 384 | when the idle state corresponding to it "matches" the observed (post-wakeup) | ||
| 385 | idle duration and the target residency of the next idle state is less than or | ||
| 386 | equal to the sleep length (i.e. the idle state "matching" the sleep length is | ||
| 387 | deeper than the given one). | ||
| 388 | |||
| 389 | The governor walks the list of idle states provided by the ``CPUIdle`` driver | ||
| 390 | and finds the last (deepest) one with the target residency less than or equal | ||
| 391 | to the sleep length. Then, the ``hits`` and ``misses`` metrics of that idle | ||
| 392 | state are compared with each other and it is preselected if the ``hits`` one is | ||
| 393 | greater (which means that that idle state is likely to "match" the observed idle | ||
| 394 | duration after CPU wakeup). If the ``misses`` one is greater, the governor | ||
| 395 | preselects the shallower idle state with the maximum ``early_hits`` metric | ||
| 396 | (or if there are multiple shallower idle states with equal ``early_hits`` | ||
| 397 | metric which also is the maximum, the shallowest of them will be preselected). | ||
| 398 | [If there is a wakeup latency constraint coming from the `PM QoS framework | ||
| 399 | <cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the | ||
| 400 | target residency within the sleep length, the deepest idle state with the exit | ||
| 401 | latency within the constraint is preselected without consulting the ``hits``, | ||
| 402 | ``misses`` and ``early_hits`` metrics.] | ||
| 403 | |||
| 404 | Next, the governor takes several idle duration values observed most recently | ||
| 405 | into consideration and if at least a half of them are greater than or equal to | ||
| 406 | the target residency of the preselected idle state, that idle state becomes the | ||
| 407 | final candidate to ask for. Otherwise, the average of the most recent idle | ||
| 408 | duration values below the target residency of the preselected idle state is | ||
| 409 | computed and the governor walks the idle states shallower than the preselected | ||
| 410 | one and finds the deepest of them with the target residency within that average. | ||
| 411 | That idle state is then taken as the final candidate to ask for. | ||
| 412 | |||
| 413 | Still, at this point the governor may need to refine the idle state selection if | ||
| 414 | it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That | ||
| 415 | generally happens if the target residency of the idle state selected so far is | ||
| 416 | less than the tick period and the tick has not been stopped already (in a | ||
| 417 | previous iteration of the idle loop). Then, like in the ``menu`` governor | ||
| 418 | `case <menu-gov_>`_, the sleep length used in the previous computations may not | ||
| 419 | reflect the real time until the closest timer event and if it really is greater | ||
| 420 | than that time, a shallower state with a suitable target residency may need to | ||
| 421 | be selected. | ||
| 422 | |||
| 423 | |||
| 336 | .. _idle-states-representation: | 424 | .. _idle-states-representation: |
| 337 | 425 | ||
| 338 | Representation of Idle States | 426 | Representation of Idle States |
diff --git a/Documentation/cpuidle/driver.txt b/Documentation/cpuidle/driver.txt deleted file mode 100644 index 1b0d81d92583..000000000000 --- a/Documentation/cpuidle/driver.txt +++ /dev/null | |||
| @@ -1,37 +0,0 @@ | |||
| 1 | |||
| 2 | |||
| 3 | Supporting multiple CPU idle levels in kernel | ||
| 4 | |||
| 5 | cpuidle drivers | ||
| 6 | |||
| 7 | |||
| 8 | |||
| 9 | |||
| 10 | cpuidle driver hooks into the cpuidle infrastructure and handles the | ||
| 11 | architecture/platform dependent part of CPU idle states. Driver | ||
| 12 | provides the platform idle state detection capability and also | ||
| 13 | has mechanisms in place to support actual entry-exit into CPU idle states. | ||
| 14 | |||
| 15 | cpuidle driver initializes the cpuidle_device structure for each CPU device | ||
| 16 | and registers with cpuidle using cpuidle_register_device. | ||
| 17 | |||
| 18 | If all the idle states are the same, the wrapper function cpuidle_register | ||
| 19 | could be used instead. | ||
| 20 | |||
| 21 | It can also support the dynamic changes (like battery <-> AC), by using | ||
| 22 | cpuidle_pause_and_lock, cpuidle_disable_device and cpuidle_enable_device, | ||
| 23 | cpuidle_resume_and_unlock. | ||
| 24 | |||
| 25 | Interfaces: | ||
| 26 | extern int cpuidle_register(struct cpuidle_driver *drv, | ||
| 27 | const struct cpumask *const coupled_cpus); | ||
| 28 | extern int cpuidle_unregister(struct cpuidle_driver *drv); | ||
| 29 | extern int cpuidle_register_driver(struct cpuidle_driver *drv); | ||
| 30 | extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); | ||
| 31 | extern int cpuidle_register_device(struct cpuidle_device *dev); | ||
| 32 | extern void cpuidle_unregister_device(struct cpuidle_device *dev); | ||
| 33 | |||
| 34 | extern void cpuidle_pause_and_lock(void); | ||
| 35 | extern void cpuidle_resume_and_unlock(void); | ||
| 36 | extern int cpuidle_enable_device(struct cpuidle_device *dev); | ||
| 37 | extern void cpuidle_disable_device(struct cpuidle_device *dev); | ||
diff --git a/Documentation/cpuidle/governor.txt b/Documentation/cpuidle/governor.txt deleted file mode 100644 index d9020f5e847b..000000000000 --- a/Documentation/cpuidle/governor.txt +++ /dev/null | |||
| @@ -1,28 +0,0 @@ | |||
| 1 | |||
| 2 | |||
| 3 | |||
| 4 | Supporting multiple CPU idle levels in kernel | ||
| 5 | |||
| 6 | cpuidle governors | ||
| 7 | |||
| 8 | |||
| 9 | |||
| 10 | |||
| 11 | cpuidle governor is policy routine that decides what idle state to enter at | ||
| 12 | any given time. cpuidle core uses different callbacks to the governor. | ||
| 13 | |||
| 14 | * enable() to enable governor for a particular device | ||
| 15 | * disable() to disable governor for a particular device | ||
| 16 | * select() to select an idle state to enter | ||
| 17 | * reflect() called after returning from the idle state, which can be used | ||
| 18 | by the governor for some record keeping. | ||
| 19 | |||
| 20 | More than one governor can be registered at the same time and | ||
| 21 | users can switch between drivers using /sysfs interface (when enabled). | ||
| 22 | More than one governor part is supported for developers to easily experiment | ||
| 23 | with different governors. By default, most optimal governor based on your | ||
| 24 | kernel configuration and platform will be selected by cpuidle. | ||
| 25 | |||
| 26 | Interfaces: | ||
| 27 | extern int cpuidle_register_governor(struct cpuidle_governor *gov); | ||
| 28 | struct cpuidle_governor | ||
diff --git a/Documentation/driver-api/pm/cpuidle.rst b/Documentation/driver-api/pm/cpuidle.rst new file mode 100644 index 000000000000..5842ab621a58 --- /dev/null +++ b/Documentation/driver-api/pm/cpuidle.rst | |||
| @@ -0,0 +1,282 @@ | |||
| 1 | .. |struct cpuidle_governor| replace:: :c:type:`struct cpuidle_governor <cpuidle_governor>` | ||
| 2 | .. |struct cpuidle_device| replace:: :c:type:`struct cpuidle_device <cpuidle_device>` | ||
| 3 | .. |struct cpuidle_driver| replace:: :c:type:`struct cpuidle_driver <cpuidle_driver>` | ||
| 4 | .. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>` | ||
| 5 | |||
| 6 | ======================== | ||
| 7 | CPU Idle Time Management | ||
| 8 | ======================== | ||
| 9 | |||
| 10 | :: | ||
| 11 | |||
| 12 | Copyright (c) 2019 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com> | ||
| 13 | |||
| 14 | |||
| 15 | CPU Idle Time Management Subsystem | ||
| 16 | ================================== | ||
| 17 | |||
| 18 | Every time one of the logical CPUs in the system (the entities that appear to | ||
| 19 | fetch and execute instructions: hardware threads, if present, or processor | ||
| 20 | cores) is idle after an interrupt or equivalent wakeup event, which means that | ||
| 21 | there are no tasks to run on it except for the special "idle" task associated | ||
| 22 | with it, there is an opportunity to save energy for the processor that it | ||
| 23 | belongs to. That can be done by making the idle logical CPU stop fetching | ||
| 24 | instructions from memory and putting some of the processor's functional units | ||
| 25 | depended on by it into an idle state in which they will draw less power. | ||
| 26 | |||
| 27 | However, there may be multiple different idle states that can be used in such a | ||
| 28 | situation in principle, so it may be necessary to find the most suitable one | ||
| 29 | (from the kernel perspective) and ask the processor to use (or "enter") that | ||
| 30 | particular idle state. That is the role of the CPU idle time management | ||
| 31 | subsystem in the kernel, called ``CPUIdle``. | ||
| 32 | |||
| 33 | The design of ``CPUIdle`` is modular and based on the code duplication avoidance | ||
| 34 | principle, so the generic code that in principle need not depend on the hardware | ||
| 35 | or platform design details in it is separate from the code that interacts with | ||
| 36 | the hardware. It generally is divided into three categories of functional | ||
| 37 | units: *governors* responsible for selecting idle states to ask the processor | ||
| 38 | to enter, *drivers* that pass the governors' decisions on to the hardware and | ||
| 39 | the *core* providing a common framework for them. | ||
| 40 | |||
| 41 | |||
| 42 | CPU Idle Time Governors | ||
| 43 | ======================= | ||
| 44 | |||
| 45 | A CPU idle time (``CPUIdle``) governor is a bundle of policy code invoked when | ||
| 46 | one of the logical CPUs in the system turns out to be idle. Its role is to | ||
| 47 | select an idle state to ask the processor to enter in order to save some energy. | ||
| 48 | |||
| 49 | ``CPUIdle`` governors are generic and each of them can be used on any hardware | ||
| 50 | platform that the Linux kernel can run on. For this reason, data structures | ||
| 51 | operated on by them cannot depend on any hardware architecture or platform | ||
| 52 | design details as well. | ||
| 53 | |||
| 54 | The governor itself is represented by a |struct cpuidle_governor| object | ||
| 55 | containing four callback pointers, :c:member:`enable`, :c:member:`disable`, | ||
| 56 | :c:member:`select`, :c:member:`reflect`, a :c:member:`rating` field described | ||
| 57 | below, and a name (string) used for identifying it. | ||
| 58 | |||
| 59 | For the governor to be available at all, that object needs to be registered | ||
| 60 | with the ``CPUIdle`` core by calling :c:func:`cpuidle_register_governor()` with | ||
| 61 | a pointer to it passed as the argument. If successful, that causes the core to | ||
| 62 | add the governor to the global list of available governors and, if it is the | ||
| 63 | only one in the list (that is, the list was empty before) or the value of its | ||
| 64 | :c:member:`rating` field is greater than the value of that field for the | ||
| 65 | governor currently in use, or the name of the new governor was passed to the | ||
| 66 | kernel as the value of the ``cpuidle.governor=`` command line parameter, the new | ||
| 67 | governor will be used from that point on (there can be only one ``CPUIdle`` | ||
| 68 | governor in use at a time). Also, if ``cpuidle_sysfs_switch`` is passed to the | ||
| 69 | kernel in the command line, user space can choose the ``CPUIdle`` governor to | ||
| 70 | use at run time via ``sysfs``. | ||
| 71 | |||
| 72 | Once registered, ``CPUIdle`` governors cannot be unregistered, so it is not | ||
| 73 | practical to put them into loadable kernel modules. | ||
| 74 | |||
| 75 | The interface between ``CPUIdle`` governors and the core consists of four | ||
| 76 | callbacks: | ||
| 77 | |||
| 78 | :c:member:`enable` | ||
| 79 | :: | ||
| 80 | |||
| 81 | int (*enable) (struct cpuidle_driver *drv, struct cpuidle_device *dev); | ||
| 82 | |||
| 83 | The role of this callback is to prepare the governor for handling the | ||
| 84 | (logical) CPU represented by the |struct cpuidle_device| object pointed | ||
| 85 | to by the ``dev`` argument. The |struct cpuidle_driver| object pointed | ||
| 86 | to by the ``drv`` argument represents the ``CPUIdle`` driver to be used | ||
| 87 | with that CPU (among other things, it should contain the list of | ||
| 88 | |struct cpuidle_state| objects representing idle states that the | ||
| 89 | processor holding the given CPU can be asked to enter). | ||
| 90 | |||
| 91 | It may fail, in which case it is expected to return a negative error | ||
| 92 | code, and that causes the kernel to run the architecture-specific | ||
| 93 | default code for idle CPUs on the CPU in question instead of ``CPUIdle`` | ||
| 94 | until the ``->enable()`` governor callback is invoked for that CPU | ||
| 95 | again. | ||
| 96 | |||
| 97 | :c:member:`disable` | ||
| 98 | :: | ||
| 99 | |||
| 100 | void (*disable) (struct cpuidle_driver *drv, struct cpuidle_device *dev); | ||
| 101 | |||
| 102 | Called to make the governor stop handling the (logical) CPU represented | ||
| 103 | by the |struct cpuidle_device| object pointed to by the ``dev`` | ||
| 104 | argument. | ||
| 105 | |||
| 106 | It is expected to reverse any changes made by the ``->enable()`` | ||
| 107 | callback when it was last invoked for the target CPU, free all memory | ||
| 108 | allocated by that callback and so on. | ||
| 109 | |||
| 110 | :c:member:`select` | ||
| 111 | :: | ||
| 112 | |||
| 113 | int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev, | ||
| 114 | bool *stop_tick); | ||
| 115 | |||
| 116 | Called to select an idle state for the processor holding the (logical) | ||
| 117 | CPU represented by the |struct cpuidle_device| object pointed to by the | ||
| 118 | ``dev`` argument. | ||
| 119 | |||
| 120 | The list of idle states to take into consideration is represented by the | ||
| 121 | :c:member:`states` array of |struct cpuidle_state| objects held by the | ||
| 122 | |struct cpuidle_driver| object pointed to by the ``drv`` argument (which | ||
| 123 | represents the ``CPUIdle`` driver to be used with the CPU at hand). The | ||
| 124 | value returned by this callback is interpreted as an index into that | ||
| 125 | array (unless it is a negative error code). | ||
| 126 | |||
| 127 | The ``stop_tick`` argument is used to indicate whether or not to stop | ||
| 128 | the scheduler tick before asking the processor to enter the selected | ||
| 129 | idle state. When the ``bool`` variable pointed to by it (which is set | ||
| 130 | to ``true`` before invoking this callback) is cleared to ``false``, the | ||
| 131 | processor will be asked to enter the selected idle state without | ||
| 132 | stopping the scheduler tick on the given CPU (if the tick has been | ||
| 133 | stopped on that CPU already, however, it will not be restarted before | ||
| 134 | asking the processor to enter the idle state). | ||
| 135 | |||
| 136 | This callback is mandatory (i.e. the :c:member:`select` callback pointer | ||
| 137 | in |struct cpuidle_governor| must not be ``NULL`` for the registration | ||
| 138 | of the governor to succeed). | ||
| 139 | |||
| 140 | :c:member:`reflect` | ||
| 141 | :: | ||
| 142 | |||
| 143 | void (*reflect) (struct cpuidle_device *dev, int index); | ||
| 144 | |||
| 145 | Called to allow the governor to evaluate the accuracy of the idle state | ||
| 146 | selection made by the ``->select()`` callback (when it was invoked last | ||
| 147 | time) and possibly use the result of that to improve the accuracy of | ||
| 148 | idle state selections in the future. | ||
| 149 | |||
| 150 | In addition, ``CPUIdle`` governors are required to take power management | ||
| 151 | quality of service (PM QoS) constraints on the processor wakeup latency into | ||
| 152 | account when selecting idle states. In order to obtain the current effective | ||
| 153 | PM QoS wakeup latency constraint for a given CPU, a ``CPUIdle`` governor is | ||
| 154 | expected to pass the number of the CPU to | ||
| 155 | :c:func:`cpuidle_governor_latency_req()`. Then, the governor's ``->select()`` | ||
| 156 | callback must not return the index of an idle state whose | ||
| 157 | :c:member:`exit_latency` value is greater than the number returned by that | ||
| 158 | function. | ||
| 159 | |||
| 160 | |||
| 161 | CPU Idle Time Management Drivers | ||
| 162 | ================================ | ||
| 163 | |||
| 164 | CPU idle time management (``CPUIdle``) drivers provide an interface between the | ||
| 165 | other parts of ``CPUIdle`` and the hardware. | ||
| 166 | |||
| 167 | First of all, a ``CPUIdle`` driver has to populate the :c:member:`states` array | ||
| 168 | of |struct cpuidle_state| objects included in the |struct cpuidle_driver| object | ||
| 169 | representing it. Going forward this array will represent the list of available | ||
| 170 | idle states that the processor hardware can be asked to enter shared by all of | ||
| 171 | the logical CPUs handled by the given driver. | ||
| 172 | |||
| 173 | The entries in the :c:member:`states` array are expected to be sorted by the | ||
| 174 | value of the :c:member:`target_residency` field in |struct cpuidle_state| in | ||
| 175 | the ascending order (that is, index 0 should correspond to the idle state with | ||
| 176 | the minimum value of :c:member:`target_residency`). [Since the | ||
| 177 | :c:member:`target_residency` value is expected to reflect the "depth" of the | ||
| 178 | idle state represented by the |struct cpuidle_state| object holding it, this | ||
| 179 | sorting order should be the same as the ascending sorting order by the idle | ||
| 180 | state "depth".] | ||
| 181 | |||
| 182 | Three fields in |struct cpuidle_state| are used by the existing ``CPUIdle`` | ||
| 183 | governors for computations related to idle state selection: | ||
| 184 | |||
| 185 | :c:member:`target_residency` | ||
| 186 | Minimum time to spend in this idle state including the time needed to | ||
| 187 | enter it (which may be substantial) to save more energy than could | ||
| 188 | be saved by staying in a shallower idle state for the same amount of | ||
| 189 | time, in microseconds. | ||
| 190 | |||
| 191 | :c:member:`exit_latency` | ||
| 192 | Maximum time it will take a CPU asking the processor to enter this idle | ||
| 193 | state to start executing the first instruction after a wakeup from it, | ||
| 194 | in microseconds. | ||
| 195 | |||
| 196 | :c:member:`flags` | ||
| 197 | Flags representing idle state properties. Currently, governors only use | ||
| 198 | the ``CPUIDLE_FLAG_POLLING`` flag which is set if the given object | ||
| 199 | does not represent a real idle state, but an interface to a software | ||
| 200 | "loop" that can be used in order to avoid asking the processor to enter | ||
| 201 | any idle state at all. [There are other flags used by the ``CPUIdle`` | ||
| 202 | core in special situations.] | ||
| 203 | |||
| 204 | The :c:member:`enter` callback pointer in |struct cpuidle_state|, which must not | ||
| 205 | be ``NULL``, points to the routine to execute in order to ask the processor to | ||
| 206 | enter this particular idle state: | ||
| 207 | |||
| 208 | :: | ||
| 209 | |||
| 210 | void (*enter) (struct cpuidle_device *dev, struct cpuidle_driver *drv, | ||
| 211 | int index); | ||
| 212 | |||
| 213 | The first two arguments of it point to the |struct cpuidle_device| object | ||
| 214 | representing the logical CPU running this callback and the | ||
| 215 | |struct cpuidle_driver| object representing the driver itself, respectively, | ||
| 216 | and the last one is an index of the |struct cpuidle_state| entry in the driver's | ||
| 217 | :c:member:`states` array representing the idle state to ask the processor to | ||
| 218 | enter. | ||
| 219 | |||
| 220 | The analogous ``->enter_s2idle()`` callback in |struct cpuidle_state| is used | ||
| 221 | only for implementing the suspend-to-idle system-wide power management feature. | ||
| 222 | The difference between it and ``->enter()`` is that it must not re-enable | ||
| 223 | interrupts at any point (even temporarily) or attempt to change the states of | ||
| 224 | clock event devices, which the ``->enter()`` callback may do sometimes. | ||
| 225 | |||
| 226 | Once the :c:member:`states` array has been populated, the number of valid | ||
| 227 | entries in it has to be stored in the :c:member:`state_count` field of the | ||
| 228 | |struct cpuidle_driver| object representing the driver. Moreover, if any | ||
| 229 | entries in the :c:member:`states` array represent "coupled" idle states (that | ||
| 230 | is, idle states that can only be asked for if multiple related logical CPUs are | ||
| 231 | idle), the :c:member:`safe_state_index` field in |struct cpuidle_driver| needs | ||
| 232 | to be the index of an idle state that is not "coupled" (that is, one that can be | ||
| 233 | asked for if only one logical CPU is idle). | ||
| 234 | |||
| 235 | In addition to that, if the given ``CPUIdle`` driver is only going to handle a | ||
| 236 | subset of logical CPUs in the system, the :c:member:`cpumask` field in its | ||
| 237 | |struct cpuidle_driver| object must point to the set (mask) of CPUs that will be | ||
| 238 | handled by it. | ||
| 239 | |||
| 240 | A ``CPUIdle`` driver can only be used after it has been registered. If there | ||
| 241 | are no "coupled" idle state entries in the driver's :c:member:`states` array, | ||
| 242 | that can be accomplished by passing the driver's |struct cpuidle_driver| object | ||
| 243 | to :c:func:`cpuidle_register_driver()`. Otherwise, :c:func:`cpuidle_register()` | ||
| 244 | should be used for this purpose. | ||
| 245 | |||
| 246 | However, it also is necessary to register |struct cpuidle_device| objects for | ||
| 247 | all of the logical CPUs to be handled by the given ``CPUIdle`` driver with the | ||
| 248 | help of :c:func:`cpuidle_register_device()` after the driver has been registered | ||
| 249 | and :c:func:`cpuidle_register_driver()`, unlike :c:func:`cpuidle_register()`, | ||
| 250 | does not do that automatically. For this reason, the drivers that use | ||
| 251 | :c:func:`cpuidle_register_driver()` to register themselves must also take care | ||
| 252 | of registering the |struct cpuidle_device| objects as needed, so it is generally | ||
| 253 | recommended to use :c:func:`cpuidle_register()` for ``CPUIdle`` driver | ||
| 254 | registration in all cases. | ||
| 255 | |||
| 256 | The registration of a |struct cpuidle_device| object causes the ``CPUIdle`` | ||
| 257 | ``sysfs`` interface to be created and the governor's ``->enable()`` callback to | ||
| 258 | be invoked for the logical CPU represented by it, so it must take place after | ||
| 259 | registering the driver that will handle the CPU in question. | ||
| 260 | |||
| 261 | ``CPUIdle`` drivers and |struct cpuidle_device| objects can be unregistered | ||
| 262 | when they are not necessary any more which allows some resources associated with | ||
| 263 | them to be released. Due to dependencies between them, all of the | ||
| 264 | |struct cpuidle_device| objects representing CPUs handled by the given | ||
| 265 | ``CPUIdle`` driver must be unregistered, with the help of | ||
| 266 | :c:func:`cpuidle_unregister_device()`, before calling | ||
| 267 | :c:func:`cpuidle_unregister_driver()` to unregister the driver. Alternatively, | ||
| 268 | :c:func:`cpuidle_unregister()` can be called to unregister a ``CPUIdle`` driver | ||
| 269 | along with all of the |struct cpuidle_device| objects representing CPUs handled | ||
| 270 | by it. | ||
| 271 | |||
| 272 | ``CPUIdle`` drivers can respond to runtime system configuration changes that | ||
| 273 | lead to modifications of the list of available processor idle states (which can | ||
| 274 | happen, for example, when the system's power source is switched from AC to | ||
| 275 | battery or the other way around). Upon a notification of such a change, | ||
| 276 | a ``CPUIdle`` driver is expected to call :c:func:`cpuidle_pause_and_lock()` to | ||
| 277 | turn ``CPUIdle`` off temporarily and then :c:func:`cpuidle_disable_device()` for | ||
| 278 | all of the |struct cpuidle_device| objects representing CPUs affected by that | ||
| 279 | change. Next, it can update its :c:member:`states` array in accordance with | ||
| 280 | the new configuration of the system, call :c:func:`cpuidle_enable_device()` for | ||
| 281 | all of the relevant |struct cpuidle_device| objects and invoke | ||
| 282 | :c:func:`cpuidle_resume_and_unlock()` to allow ``CPUIdle`` to be used again. | ||
diff --git a/Documentation/driver-api/pm/index.rst b/Documentation/driver-api/pm/index.rst index 2f6d0e9cf6b7..56975c6bc789 100644 --- a/Documentation/driver-api/pm/index.rst +++ b/Documentation/driver-api/pm/index.rst | |||
| @@ -1,9 +1,10 @@ | |||
| 1 | ======================= | 1 | =============================== |
| 2 | Device Power Management | 2 | CPU and Device Power Management |
| 3 | ======================= | 3 | =============================== |
| 4 | 4 | ||
| 5 | .. toctree:: | 5 | .. toctree:: |
| 6 | 6 | ||
| 7 | cpuidle | ||
| 7 | devices | 8 | devices |
| 8 | notifiers | 9 | notifiers |
| 9 | types | 10 | types |
diff --git a/MAINTAINERS b/MAINTAINERS index 9f64f8d3740e..b053a355894e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -4016,6 +4016,7 @@ S: Maintained | |||
| 4016 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git | 4016 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git |
| 4017 | B: https://bugzilla.kernel.org | 4017 | B: https://bugzilla.kernel.org |
| 4018 | F: Documentation/admin-guide/pm/cpuidle.rst | 4018 | F: Documentation/admin-guide/pm/cpuidle.rst |
| 4019 | F: Documentation/driver-api/pm/cpuidle.rst | ||
| 4019 | F: drivers/cpuidle/* | 4020 | F: drivers/cpuidle/* |
| 4020 | F: include/linux/cpuidle.h | 4021 | F: include/linux/cpuidle.h |
| 4021 | 4022 | ||
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 7e48eb5bf0a7..8caccbbd7353 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig | |||
| @@ -4,7 +4,7 @@ config CPU_IDLE | |||
| 4 | bool "CPU idle PM support" | 4 | bool "CPU idle PM support" |
| 5 | default y if ACPI || PPC_PSERIES | 5 | default y if ACPI || PPC_PSERIES |
| 6 | select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) | 6 | select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) |
| 7 | select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) | 7 | select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO |
| 8 | help | 8 | help |
| 9 | CPU idle is a generic framework for supporting software-controlled | 9 | CPU idle is a generic framework for supporting software-controlled |
| 10 | idle processor power management. It includes modular cross-platform | 10 | idle processor power management. It includes modular cross-platform |
| @@ -23,6 +23,15 @@ config CPU_IDLE_GOV_LADDER | |||
| 23 | config CPU_IDLE_GOV_MENU | 23 | config CPU_IDLE_GOV_MENU |
| 24 | bool "Menu governor (for tickless system)" | 24 | bool "Menu governor (for tickless system)" |
| 25 | 25 | ||
| 26 | config CPU_IDLE_GOV_TEO | ||
| 27 | bool "Timer events oriented (TEO) governor (for tickless systems)" | ||
| 28 | help | ||
| 29 | This governor implements a simplified idle state selection method | ||
| 30 | focused on timer events and does not do any interactivity boosting. | ||
| 31 | |||
| 32 | Some workloads benefit from using it and it generally should be safe | ||
| 33 | to use. Say Y here if you are not happy with the alternatives. | ||
| 34 | |||
| 26 | config DT_IDLE_STATES | 35 | config DT_IDLE_STATES |
| 27 | bool | 36 | bool |
| 28 | 37 | ||
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile index 1b512722689f..4d8aff5248a8 100644 --- a/drivers/cpuidle/governors/Makefile +++ b/drivers/cpuidle/governors/Makefile | |||
| @@ -4,3 +4,4 @@ | |||
| 4 | 4 | ||
| 5 | obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o | 5 | obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o |
| 6 | obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o | 6 | obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o |
| 7 | obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o | ||
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c new file mode 100644 index 000000000000..7d05efdbd3c6 --- /dev/null +++ b/drivers/cpuidle/governors/teo.c | |||
| @@ -0,0 +1,444 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * Timer events oriented CPU idle governor | ||
| 4 | * | ||
| 5 | * Copyright (C) 2018 Intel Corporation | ||
| 6 | * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> | ||
| 7 | * | ||
| 8 | * The idea of this governor is based on the observation that on many systems | ||
| 9 | * timer events are two or more orders of magnitude more frequent than any | ||
| 10 | * other interrupts, so they are likely to be the most significant source of CPU | ||
| 11 | * wakeups from idle states. Moreover, information about what happened in the | ||
| 12 | * (relatively recent) past can be used to estimate whether or not the deepest | ||
| 13 | * idle state with target residency within the time to the closest timer is | ||
| 14 | * likely to be suitable for the upcoming idle time of the CPU and, if not, then | ||
| 15 | * which of the shallower idle states to choose. | ||
| 16 | * | ||
| 17 | * Of course, non-timer wakeup sources are more important in some use cases and | ||
| 18 | * they can be covered by taking a few most recent idle time intervals of the | ||
| 19 | * CPU into account. However, even in that case it is not necessary to consider | ||
| 20 | * idle duration values greater than the time till the closest timer, as the | ||
| 21 | * patterns that they may belong to produce average values close enough to | ||
| 22 | * the time till the closest timer (sleep length) anyway. | ||
| 23 | * | ||
| 24 | * Thus this governor estimates whether or not the upcoming idle time of the CPU | ||
| 25 | * is likely to be significantly shorter than the sleep length and selects an | ||
| 26 | * idle state for it in accordance with that, as follows: | ||
| 27 | * | ||
| 28 | * - Find an idle state on the basis of the sleep length and state statistics | ||
| 29 | * collected over time: | ||
| 30 | * | ||
| 31 | * o Find the deepest idle state whose target residency is less than or equal | ||
| 32 | * to the sleep length. | ||
| 33 | * | ||
| 34 | * o Select it if it matched both the sleep length and the observed idle | ||
| 35 | * duration in the past more often than it matched the sleep length alone | ||
| 36 | * (i.e. the observed idle duration was significantly shorter than the sleep | ||
| 37 | * length matched by it). | ||
| 38 | * | ||
| 39 | * o Otherwise, select the shallower state with the greatest matched "early" | ||
| 40 | * wakeups metric. | ||
| 41 | * | ||
| 42 | * - If the majority of the most recent idle duration values are below the | ||
| 43 | * target residency of the idle state selected so far, use those values to | ||
| 44 | * compute the new expected idle duration and find an idle state matching it | ||
| 45 | * (which has to be shallower than the one selected so far). | ||
| 46 | */ | ||
| 47 | |||
| 48 | #include <linux/cpuidle.h> | ||
| 49 | #include <linux/jiffies.h> | ||
| 50 | #include <linux/kernel.h> | ||
| 51 | #include <linux/sched/clock.h> | ||
| 52 | #include <linux/tick.h> | ||
| 53 | |||
/*
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
 * is used for decreasing metrics on a regular basis.
 *
 * In other words, each metric loses 1/2^DECAY_SHIFT (i.e. 1/8) of its value
 * on every update and gains PULSE when the corresponding event is observed,
 * so it behaves as an exponentially decaying event counter.
 */
#define PULSE		1024
#define DECAY_SHIFT	3

/*
 * Number of the most recent idle duration values to take into consideration for
 * the detection of wakeup patterns.
 */
#define INTERVALS	8
| 66 | |||
/**
 * struct teo_idle_state - Idle state data used by the TEO cpuidle governor.
 * @early_hits: "Early" CPU wakeups "matching" this state.
 * @hits: "On time" CPU wakeups "matching" this state.
 * @misses: CPU wakeups "missing" this state.
 *
 * A CPU wakeup is "matched" by a given idle state if the idle duration measured
 * after the wakeup is between the target residency of that state and the target
 * residency of the next one (or if this is the deepest available idle state, it
 * "matches" a CPU wakeup when the measured idle duration is at least equal to
 * its target residency).
 *
 * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if
 * it occurs significantly earlier than the closest expected timer event (that
 * is, early enough to match an idle state shallower than the one matching the
 * time till the closest timer event).  Otherwise, the wakeup is "on time", or
 * it is a "hit".
 *
 * A "miss" occurs when the given state doesn't match the wakeup, but it matches
 * the time till the closest timer event used for idle state selection.
 *
 * All three fields are decaying counters: they are decayed by DECAY_SHIFT and
 * bumped by PULSE in teo_update().
 */
struct teo_idle_state {
	unsigned int early_hits;
	unsigned int hits;
	unsigned int misses;
};
| 93 | |||
/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @time_span_ns: Time between idle state selection and post-wakeup update.
 *                (Holds the raw selection timestamp from local_clock() until
 *                teo_reflect() converts it into a duration.)
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 * @states: Idle states data corresponding to this CPU.
 * @last_state: Idle state entered by the CPU last time (-1 once it has been
 *              consumed by teo_update()).
 * @interval_idx: Index of the most recent saved idle interval.
 * @intervals: Saved idle duration values.  Slots holding UINT_MAX do not
 *             represent observed non-timer wakeups (initial value, or a
 *             wakeup attributed to a timer).
 */
struct teo_cpu {
	u64 time_span_ns;
	u64 sleep_length_ns;
	struct teo_idle_state states[CPUIDLE_STATE_MAX];
	int last_state;
	int interval_idx;
	unsigned int intervals[INTERVALS];
};

/* Per-CPU governor state. */
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
| 113 | |||
| 114 | /** | ||
| 115 | * teo_update - Update CPU data after wakeup. | ||
| 116 | * @drv: cpuidle driver containing state data. | ||
| 117 | * @dev: Target CPU. | ||
| 118 | */ | ||
| 119 | static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) | ||
| 120 | { | ||
| 121 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
| 122 | unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); | ||
| 123 | int i, idx_hit = -1, idx_timer = -1; | ||
| 124 | unsigned int measured_us; | ||
| 125 | |||
| 126 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { | ||
| 127 | /* | ||
| 128 | * One of the safety nets has triggered or this was a timer | ||
| 129 | * wakeup (or equivalent). | ||
| 130 | */ | ||
| 131 | measured_us = sleep_length_us; | ||
| 132 | } else { | ||
| 133 | unsigned int lat = drv->states[cpu_data->last_state].exit_latency; | ||
| 134 | |||
| 135 | measured_us = ktime_to_us(cpu_data->time_span_ns); | ||
| 136 | /* | ||
| 137 | * The delay between the wakeup and the first instruction | ||
| 138 | * executed by the CPU is not likely to be worst-case every | ||
| 139 | * time, so take 1/2 of the exit latency as a very rough | ||
| 140 | * approximation of the average of it. | ||
| 141 | */ | ||
| 142 | if (measured_us >= lat) | ||
| 143 | measured_us -= lat / 2; | ||
| 144 | else | ||
| 145 | measured_us /= 2; | ||
| 146 | } | ||
| 147 | |||
| 148 | /* | ||
| 149 | * Decay the "early hits" metric for all of the states and find the | ||
| 150 | * states matching the sleep length and the measured idle duration. | ||
| 151 | */ | ||
| 152 | for (i = 0; i < drv->state_count; i++) { | ||
| 153 | unsigned int early_hits = cpu_data->states[i].early_hits; | ||
| 154 | |||
| 155 | cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; | ||
| 156 | |||
| 157 | if (drv->states[i].target_residency <= sleep_length_us) { | ||
| 158 | idx_timer = i; | ||
| 159 | if (drv->states[i].target_residency <= measured_us) | ||
| 160 | idx_hit = i; | ||
| 161 | } | ||
| 162 | } | ||
| 163 | |||
| 164 | /* | ||
| 165 | * Update the "hits" and "misses" data for the state matching the sleep | ||
| 166 | * length. If it matches the measured idle duration too, this is a hit, | ||
| 167 | * so increase the "hits" metric for it then. Otherwise, this is a | ||
| 168 | * miss, so increase the "misses" metric for it. In the latter case | ||
| 169 | * also increase the "early hits" metric for the state that actually | ||
| 170 | * matches the measured idle duration. | ||
| 171 | */ | ||
| 172 | if (idx_timer >= 0) { | ||
| 173 | unsigned int hits = cpu_data->states[idx_timer].hits; | ||
| 174 | unsigned int misses = cpu_data->states[idx_timer].misses; | ||
| 175 | |||
| 176 | hits -= hits >> DECAY_SHIFT; | ||
| 177 | misses -= misses >> DECAY_SHIFT; | ||
| 178 | |||
| 179 | if (idx_timer > idx_hit) { | ||
| 180 | misses += PULSE; | ||
| 181 | if (idx_hit >= 0) | ||
| 182 | cpu_data->states[idx_hit].early_hits += PULSE; | ||
| 183 | } else { | ||
| 184 | hits += PULSE; | ||
| 185 | } | ||
| 186 | |||
| 187 | cpu_data->states[idx_timer].misses = misses; | ||
| 188 | cpu_data->states[idx_timer].hits = hits; | ||
| 189 | } | ||
| 190 | |||
| 191 | /* | ||
| 192 | * If the total time span between idle state selection and the "reflect" | ||
| 193 | * callback is greater than or equal to the sleep length determined at | ||
| 194 | * the idle state selection time, the wakeup is likely to be due to a | ||
| 195 | * timer event. | ||
| 196 | */ | ||
| 197 | if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) | ||
| 198 | measured_us = UINT_MAX; | ||
| 199 | |||
| 200 | /* | ||
| 201 | * Save idle duration values corresponding to non-timer wakeups for | ||
| 202 | * pattern detection. | ||
| 203 | */ | ||
| 204 | cpu_data->intervals[cpu_data->interval_idx++] = measured_us; | ||
| 205 | if (cpu_data->interval_idx > INTERVALS) | ||
| 206 | cpu_data->interval_idx = 0; | ||
| 207 | } | ||
| 208 | |||
| 209 | /** | ||
| 210 | * teo_find_shallower_state - Find shallower idle state matching given duration. | ||
| 211 | * @drv: cpuidle driver containing state data. | ||
| 212 | * @dev: Target CPU. | ||
| 213 | * @state_idx: Index of the capping idle state. | ||
| 214 | * @duration_us: Idle duration value to match. | ||
| 215 | */ | ||
| 216 | static int teo_find_shallower_state(struct cpuidle_driver *drv, | ||
| 217 | struct cpuidle_device *dev, int state_idx, | ||
| 218 | unsigned int duration_us) | ||
| 219 | { | ||
| 220 | int i; | ||
| 221 | |||
| 222 | for (i = state_idx - 1; i >= 0; i--) { | ||
| 223 | if (drv->states[i].disabled || dev->states_usage[i].disable) | ||
| 224 | continue; | ||
| 225 | |||
| 226 | state_idx = i; | ||
| 227 | if (drv->states[i].target_residency <= duration_us) | ||
| 228 | break; | ||
| 229 | } | ||
| 230 | return state_idx; | ||
| 231 | } | ||
| 232 | |||
| 233 | /** | ||
| 234 | * teo_select - Selects the next idle state to enter. | ||
| 235 | * @drv: cpuidle driver containing state data. | ||
| 236 | * @dev: Target CPU. | ||
| 237 | * @stop_tick: Indication on whether or not to stop the scheduler tick. | ||
| 238 | */ | ||
| 239 | static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, | ||
| 240 | bool *stop_tick) | ||
| 241 | { | ||
| 242 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
| 243 | int latency_req = cpuidle_governor_latency_req(dev->cpu); | ||
| 244 | unsigned int duration_us, count; | ||
| 245 | int max_early_idx, idx, i; | ||
| 246 | ktime_t delta_tick; | ||
| 247 | |||
| 248 | if (cpu_data->last_state >= 0) { | ||
| 249 | teo_update(drv, dev); | ||
| 250 | cpu_data->last_state = -1; | ||
| 251 | } | ||
| 252 | |||
| 253 | cpu_data->time_span_ns = local_clock(); | ||
| 254 | |||
| 255 | cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); | ||
| 256 | duration_us = ktime_to_us(cpu_data->sleep_length_ns); | ||
| 257 | |||
| 258 | count = 0; | ||
| 259 | max_early_idx = -1; | ||
| 260 | idx = -1; | ||
| 261 | |||
| 262 | for (i = 0; i < drv->state_count; i++) { | ||
| 263 | struct cpuidle_state *s = &drv->states[i]; | ||
| 264 | struct cpuidle_state_usage *su = &dev->states_usage[i]; | ||
| 265 | |||
| 266 | if (s->disabled || su->disable) { | ||
| 267 | /* | ||
| 268 | * If the "early hits" metric of a disabled state is | ||
| 269 | * greater than the current maximum, it should be taken | ||
| 270 | * into account, because it would be a mistake to select | ||
| 271 | * a deeper state with lower "early hits" metric. The | ||
| 272 | * index cannot be changed to point to it, however, so | ||
| 273 | * just increase the max count alone and let the index | ||
| 274 | * still point to a shallower idle state. | ||
| 275 | */ | ||
| 276 | if (max_early_idx >= 0 && | ||
| 277 | count < cpu_data->states[i].early_hits) | ||
| 278 | count = cpu_data->states[i].early_hits; | ||
| 279 | |||
| 280 | continue; | ||
| 281 | } | ||
| 282 | |||
| 283 | if (idx < 0) | ||
| 284 | idx = i; /* first enabled state */ | ||
| 285 | |||
| 286 | if (s->target_residency > duration_us) | ||
| 287 | break; | ||
| 288 | |||
| 289 | if (s->exit_latency > latency_req) { | ||
| 290 | /* | ||
| 291 | * If we break out of the loop for latency reasons, use | ||
| 292 | * the target residency of the selected state as the | ||
| 293 | * expected idle duration to avoid stopping the tick | ||
| 294 | * as long as that target residency is low enough. | ||
| 295 | */ | ||
| 296 | duration_us = drv->states[idx].target_residency; | ||
| 297 | goto refine; | ||
| 298 | } | ||
| 299 | |||
| 300 | idx = i; | ||
| 301 | |||
| 302 | if (count < cpu_data->states[i].early_hits && | ||
| 303 | !(tick_nohz_tick_stopped() && | ||
| 304 | drv->states[i].target_residency < TICK_USEC)) { | ||
| 305 | count = cpu_data->states[i].early_hits; | ||
| 306 | max_early_idx = i; | ||
| 307 | } | ||
| 308 | } | ||
| 309 | |||
| 310 | /* | ||
| 311 | * If the "hits" metric of the idle state matching the sleep length is | ||
| 312 | * greater than its "misses" metric, that is the one to use. Otherwise, | ||
| 313 | * it is more likely that one of the shallower states will match the | ||
| 314 | * idle duration observed after wakeup, so take the one with the maximum | ||
| 315 | * "early hits" metric, but if that cannot be determined, just use the | ||
| 316 | * state selected so far. | ||
| 317 | */ | ||
| 318 | if (cpu_data->states[idx].hits <= cpu_data->states[idx].misses && | ||
| 319 | max_early_idx >= 0) { | ||
| 320 | idx = max_early_idx; | ||
| 321 | duration_us = drv->states[idx].target_residency; | ||
| 322 | } | ||
| 323 | |||
| 324 | refine: | ||
| 325 | if (idx < 0) { | ||
| 326 | idx = 0; /* No states enabled. Must use 0. */ | ||
| 327 | } else if (idx > 0) { | ||
| 328 | u64 sum = 0; | ||
| 329 | |||
| 330 | count = 0; | ||
| 331 | |||
| 332 | /* | ||
| 333 | * Count and sum the most recent idle duration values less than | ||
| 334 | * the target residency of the state selected so far, find the | ||
| 335 | * max. | ||
| 336 | */ | ||
| 337 | for (i = 0; i < INTERVALS; i++) { | ||
| 338 | unsigned int val = cpu_data->intervals[i]; | ||
| 339 | |||
| 340 | if (val >= drv->states[idx].target_residency) | ||
| 341 | continue; | ||
| 342 | |||
| 343 | count++; | ||
| 344 | sum += val; | ||
| 345 | } | ||
| 346 | |||
| 347 | /* | ||
| 348 | * Give up unless the majority of the most recent idle duration | ||
| 349 | * values are in the interesting range. | ||
| 350 | */ | ||
| 351 | if (count > INTERVALS / 2) { | ||
| 352 | unsigned int avg_us = div64_u64(sum, count); | ||
| 353 | |||
| 354 | /* | ||
| 355 | * Avoid spending too much time in an idle state that | ||
| 356 | * would be too shallow. | ||
| 357 | */ | ||
| 358 | if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { | ||
| 359 | idx = teo_find_shallower_state(drv, dev, idx, avg_us); | ||
| 360 | duration_us = avg_us; | ||
| 361 | } | ||
| 362 | } | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Don't stop the tick if the selected state is a polling one or if the | ||
| 367 | * expected idle duration is shorter than the tick period length. | ||
| 368 | */ | ||
| 369 | if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || | ||
| 370 | duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { | ||
| 371 | unsigned int delta_tick_us = ktime_to_us(delta_tick); | ||
| 372 | |||
| 373 | *stop_tick = false; | ||
| 374 | |||
| 375 | /* | ||
| 376 | * The tick is not going to be stopped, so if the target | ||
| 377 | * residency of the state to be returned is not within the time | ||
| 378 | * till the closest timer including the tick, try to correct | ||
| 379 | * that. | ||
| 380 | */ | ||
| 381 | if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) | ||
| 382 | idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); | ||
| 383 | } | ||
| 384 | |||
| 385 | return idx; | ||
| 386 | } | ||
| 387 | |||
/**
 * teo_reflect - Note that governor data for the CPU need to be updated.
 * @dev: Target CPU.
 * @state: Entered state.
 *
 * Runs right after wakeup.  Only records what is needed; the heavier metrics
 * update is deferred to teo_update(), invoked from the next teo_select() call
 * (which checks last_state >= 0 to see whether an update is pending).
 */
static void teo_reflect(struct cpuidle_device *dev, int state)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	cpu_data->last_state = state;
	/*
	 * If the wakeup was not "natural", but triggered by one of the safety
	 * nets, assume that the CPU might have been idle for the entire sleep
	 * length time.
	 *
	 * NOTE(review): the order of the two tests below looks significant —
	 * poll_time_limit is consumed (cleared) here, and
	 * tick_nohz_idle_got_tick() is presumably not side-effect free; do not
	 * reorder without confirming against the tick code.
	 */
	if (dev->poll_time_limit ||
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
		dev->poll_time_limit = false;
		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
	} else {
		/*
		 * time_span_ns held the selection timestamp (set in
		 * teo_select()); convert it into the elapsed span.
		 */
		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
	}
}
| 411 | |||
| 412 | /** | ||
| 413 | * teo_enable_device - Initialize the governor's data for the target CPU. | ||
| 414 | * @drv: cpuidle driver (not used). | ||
| 415 | * @dev: Target CPU. | ||
| 416 | */ | ||
| 417 | static int teo_enable_device(struct cpuidle_driver *drv, | ||
| 418 | struct cpuidle_device *dev) | ||
| 419 | { | ||
| 420 | struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); | ||
| 421 | int i; | ||
| 422 | |||
| 423 | memset(cpu_data, 0, sizeof(*cpu_data)); | ||
| 424 | |||
| 425 | for (i = 0; i < INTERVALS; i++) | ||
| 426 | cpu_data->intervals[i] = UINT_MAX; | ||
| 427 | |||
| 428 | return 0; | ||
| 429 | } | ||
| 430 | |||
/*
 * Governor descriptor.  NOTE(review): rating 19 is presumably chosen to sit
 * just below the menu governor's rating so that menu remains the default when
 * both are built in — confirm against the other governors' ratings.
 */
static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};

/* Register the governor with the cpuidle core early in boot. */
static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4dff74f48d4b..3b39472324a3 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h | |||
| @@ -69,11 +69,9 @@ struct cpuidle_state { | |||
| 69 | 69 | ||
| 70 | /* Idle State Flags */ | 70 | /* Idle State Flags */ |
| 71 | #define CPUIDLE_FLAG_NONE (0x00) | 71 | #define CPUIDLE_FLAG_NONE (0x00) |
| 72 | #define CPUIDLE_FLAG_POLLING (0x01) /* polling state */ | 72 | #define CPUIDLE_FLAG_POLLING BIT(0) /* polling state */ |
| 73 | #define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ | 73 | #define CPUIDLE_FLAG_COUPLED BIT(1) /* state applies to multiple cpus */ |
| 74 | #define CPUIDLE_FLAG_TIMER_STOP (0x04) /* timer is stopped on this state */ | 74 | #define CPUIDLE_FLAG_TIMER_STOP BIT(2) /* timer is stopped on this state */ |
| 75 | |||
| 76 | #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) | ||
| 77 | 75 | ||
| 78 | struct cpuidle_device_kobj; | 76 | struct cpuidle_device_kobj; |
| 79 | struct cpuidle_state_kobj; | 77 | struct cpuidle_state_kobj; |
