-rw-r--r--  Documentation/ABI/testing/sysfs-power | 45
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 22
-rw-r--r--  Documentation/cpu-freq/cpufreq-stats.txt | 6
-rw-r--r--  Documentation/cpu-freq/intel-pstate.txt | 54
-rw-r--r--  Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt | 78
-rw-r--r--  Documentation/devicetree/bindings/opp/opp.txt | 27
-rw-r--r--  Documentation/devicetree/bindings/power/domain-idle-state.txt | 33
-rw-r--r--  Documentation/devicetree/bindings/power/power_domain.txt | 43
-rw-r--r--  Documentation/power/devices.txt | 14
-rw-r--r--  Documentation/power/states.txt | 62
-rw-r--r--  MAINTAINERS | 12
-rw-r--r--  arch/arm/mach-imx/gpc.c | 17
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 9
-rw-r--r--  arch/x86/power/hibernate_64.c | 94
-rw-r--r--  drivers/acpi/processor_perflib.c | 55
-rw-r--r--  drivers/acpi/sleep.c | 8
-rw-r--r--  drivers/base/power/domain.c | 363
-rw-r--r--  drivers/base/power/main.c | 2
-rw-r--r--  drivers/base/power/opp/core.c | 521
-rw-r--r--  drivers/base/power/opp/debugfs.c | 52
-rw-r--r--  drivers/base/power/opp/of.c | 111
-rw-r--r--  drivers/base/power/opp/opp.h | 23
-rw-r--r--  drivers/base/power/power.h | 19
-rw-r--r--  drivers/base/power/qos.c | 6
-rw-r--r--  drivers/base/power/runtime.c | 62
-rw-r--r--  drivers/base/power/sysfs.c | 6
-rw-r--r--  drivers/base/power/wakeirq.c | 76
-rw-r--r--  drivers/base/power/wakeup.c | 6
-rw-r--r--  drivers/cpufreq/Kconfig.arm | 29
-rw-r--r--  drivers/cpufreq/Makefile | 2
-rw-r--r--  drivers/cpufreq/acpi-cpufreq.c | 117
-rw-r--r--  drivers/cpufreq/brcmstb-avs-cpufreq.c | 1057
-rw-r--r--  drivers/cpufreq/cppc_cpufreq.c | 7
-rw-r--r--  drivers/cpufreq/cpufreq-dt-platdev.c | 15
-rw-r--r--  drivers/cpufreq/cpufreq-dt.c | 12
-rw-r--r--  drivers/cpufreq/cpufreq.c | 25
-rw-r--r--  drivers/cpufreq/cpufreq_conservative.c | 46
-rw-r--r--  drivers/cpufreq/cpufreq_governor.c | 30
-rw-r--r--  drivers/cpufreq/cpufreq_governor.h | 5
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c | 17
-rw-r--r--  drivers/cpufreq/cpufreq_stats.c | 22
-rw-r--r--  drivers/cpufreq/integrator-cpufreq.c | 239
-rw-r--r--  drivers/cpufreq/intel_pstate.c | 826
-rw-r--r--  drivers/cpufreq/powernv-cpufreq.c | 65
-rw-r--r--  drivers/cpuidle/cpuidle-powernv.c | 2
-rw-r--r--  drivers/cpuidle/cpuidle.c | 19
-rw-r--r--  drivers/cpuidle/dt_idle_states.c | 6
-rw-r--r--  drivers/cpuidle/governor.c | 4
-rw-r--r--  drivers/cpuidle/governors/ladder.c | 2
-rw-r--r--  drivers/cpuidle/governors/menu.c | 2
-rw-r--r--  drivers/cpuidle/sysfs.c | 4
-rw-r--r--  drivers/devfreq/devfreq.c | 2
-rw-r--r--  drivers/devfreq/event/exynos-nocp.c | 1
-rw-r--r--  drivers/devfreq/event/exynos-ppmu.c | 6
-rw-r--r--  drivers/devfreq/event/rockchip-dfi.c | 1
-rw-r--r--  drivers/devfreq/exynos-bus.c | 29
-rw-r--r--  drivers/devfreq/rk3399_dmc.c | 15
-rw-r--r--  drivers/idle/intel_idle.c | 154
-rw-r--r--  drivers/net/ethernet/smsc/smsc911x.c | 6
-rw-r--r--  drivers/power/avs/rockchip-io-domain.c | 2
-rw-r--r--  drivers/powercap/intel_rapl.c | 389
-rw-r--r--  drivers/thermal/intel_powerclamp.c | 359
-rw-r--r--  include/acpi/processor.h | 3
-rw-r--r--  include/linux/cpu.h | 2
-rw-r--r--  include/linux/cpufreq.h | 6
-rw-r--r--  include/linux/cpuidle.h | 9
-rw-r--r--  include/linux/pm_domain.h | 28
-rw-r--r--  include/linux/pm_opp.h | 72
-rw-r--r--  include/linux/pm_runtime.h | 11
-rw-r--r--  include/linux/sched.h | 3
-rw-r--r--  include/linux/suspend.h | 2
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/power/main.c | 88
-rw-r--r--  kernel/power/power.h | 6
-rw-r--r--  kernel/power/suspend.c | 69
-rw-r--r--  kernel/sched/core.c | 1
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 119
-rw-r--r--  kernel/sched/idle.c | 175
-rw-r--r--  mm/kasan/kasan.c | 9
79 files changed, 4399 insertions, 1549 deletions
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 50b368d490b5..f523e5a3ac33 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -7,30 +7,35 @@ Description:
7 subsystem. 7 subsystem.
8 8
9What: /sys/power/state 9What: /sys/power/state
10Date: May 2014 10Date: November 2016
11Contact: Rafael J. Wysocki <rjw@rjwysocki.net> 11Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
12Description: 12Description:
13 The /sys/power/state file controls system sleep states. 13 The /sys/power/state file controls system sleep states.
14 Reading from this file returns the available sleep state 14 Reading from this file returns the available sleep state
15 labels, which may be "mem", "standby", "freeze" and "disk" 15 labels, which may be "mem" (suspend), "standby" (power-on
16 (hibernation). The meanings of the first three labels depend on 16 suspend), "freeze" (suspend-to-idle) and "disk" (hibernation).
17 the relative_sleep_states command line argument as follows: 17
18 1) relative_sleep_states = 1 18 Writing one of the above strings to this file causes the system
19 "mem", "standby", "freeze" represent non-hibernation sleep 19 to transition into the corresponding state, if available.
20 states from the deepest ("mem", always present) to the 20
21 shallowest ("freeze"). "standby" and "freeze" may or may 21 See Documentation/power/states.txt for more information.
22 not be present depending on the capabilities of the 22
23 platform. "freeze" can only be present if "standby" is 23What: /sys/power/mem_sleep
24 present. 24Date: November 2016
25 2) relative_sleep_states = 0 (default) 25Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
26 "mem" - "suspend-to-RAM", present if supported. 26Description:
27 "standby" - "power-on suspend", present if supported. 27 The /sys/power/mem_sleep file controls the operating mode of
28 "freeze" - "suspend-to-idle", always present. 28 system suspend. Reading from it returns the available modes
29 29 as "s2idle" (always present), "shallow" and "deep" (present if
30 Writing to this file one of these strings causes the system to 30 supported). The mode that will be used on subsequent attempts
31 transition into the corresponding state, if available. See 31 to suspend the system (by writing "mem" to the /sys/power/state
32 Documentation/power/states.txt for a description of what 32 file described above) is enclosed in square brackets.
33 "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. 33
34 Writing one of the above strings to this file causes the mode
35 represented by it to be used on subsequent attempts to suspend
36 the system.
37
38 See Documentation/power/states.txt for more information.
34 39
35What: /sys/power/disk 40What: /sys/power/disk
36Date: September 2006 41Date: September 2006
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 62d68b2056de..be2d6d0a03a4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1560,6 +1560,12 @@
1560 disable 1560 disable
1561 Do not enable intel_pstate as the default 1561 Do not enable intel_pstate as the default
1562 scaling driver for the supported processors 1562 scaling driver for the supported processors
1563 passive
1564 Use intel_pstate as a scaling driver, but configure it
1565 to work with generic cpufreq governors (instead of
1566 enabling its internal governor). This mode cannot be
1567 used along with the hardware-managed P-states (HWP)
1568 feature.
1563 force 1569 force
1564 Enable intel_pstate on systems that prohibit it by default 1570 Enable intel_pstate on systems that prohibit it by default
1565 in favor of acpi-cpufreq. Forcing the intel_pstate driver 1571 in favor of acpi-cpufreq. Forcing the intel_pstate driver
@@ -1580,6 +1586,9 @@
1580 Description Table, specifies preferred power management 1586 Description Table, specifies preferred power management
1581 profile as "Enterprise Server" or "Performance Server", 1587 profile as "Enterprise Server" or "Performance Server",
1582 then this feature is turned on by default. 1588 then this feature is turned on by default.
1589 per_cpu_perf_limits
1590 Allow per-logical-CPU P-State performance control limits using
1591 cpufreq sysfs interface
1583 1592
1584 intremap= [X86-64, Intel-IOMMU] 1593 intremap= [X86-64, Intel-IOMMU]
1585 on enable Interrupt Remapping (default) 1594 on enable Interrupt Remapping (default)
@@ -2122,6 +2131,12 @@
2122 memory contents and reserves bad memory 2131 memory contents and reserves bad memory
2123 regions that are detected. 2132 regions that are detected.
2124 2133
2134 mem_sleep_default= [SUSPEND] Default system suspend mode:
2135 s2idle - Suspend-To-Idle
2136 shallow - Power-On Suspend or equivalent (if supported)
2137 deep - Suspend-To-RAM or equivalent (if supported)
2138 See Documentation/power/states.txt.
2139
2125 meye.*= [HW] Set MotionEye Camera parameters 2140 meye.*= [HW] Set MotionEye Camera parameters
2126 See Documentation/video4linux/meye.txt. 2141 See Documentation/video4linux/meye.txt.
2127 2142
@@ -3475,13 +3490,6 @@
3475 [KNL, SMP] Set scheduler's default relax_domain_level. 3490 [KNL, SMP] Set scheduler's default relax_domain_level.
3476 See Documentation/cgroup-v1/cpusets.txt. 3491 See Documentation/cgroup-v1/cpusets.txt.
3477 3492
3478 relative_sleep_states=
3479 [SUSPEND] Use sleep state labeling where the deepest
3480 state available other than hibernation is always "mem".
3481 Format: { "0" | "1" }
3482 0 -- Traditional sleep state labels.
3483 1 -- Relative sleep state labels.
3484
3485 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area 3493 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
3486 3494
3487 reservetop= [X86-32] 3495 reservetop= [X86-32]
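To illustrate the syntax of the parameters documented above, the command line of a running kernel can be inspected as shown below. The option values are hypothetical examples, not recommendations, and intel_pstate=passive cannot be combined with HWP as noted above:

    $ cat /proc/cmdline
    BOOT_IMAGE=/vmlinuz ... intel_pstate=passive mem_sleep_default=s2idle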
diff --git a/Documentation/cpu-freq/cpufreq-stats.txt b/Documentation/cpu-freq/cpufreq-stats.txt
index 8d9773f23550..3c355f6ad834 100644
--- a/Documentation/cpu-freq/cpufreq-stats.txt
+++ b/Documentation/cpu-freq/cpufreq-stats.txt
@@ -44,11 +44,17 @@ the stats driver insertion.
44total 0 44total 0
45drwxr-xr-x 2 root root 0 May 14 16:06 . 45drwxr-xr-x 2 root root 0 May 14 16:06 .
46drwxr-xr-x 3 root root 0 May 14 15:58 .. 46drwxr-xr-x 3 root root 0 May 14 15:58 ..
47--w------- 1 root root 4096 May 14 16:06 reset
47-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state 48-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
48-r--r--r-- 1 root root 4096 May 14 16:06 total_trans 49-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
49-r--r--r-- 1 root root 4096 May 14 16:06 trans_table 50-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
50-------------------------------------------------------------------------------- 51--------------------------------------------------------------------------------
51 52
53- reset
54Write-only attribute that can be used to reset the stat counters. This can be
55useful for evaluating system behaviour under different governors without the
56need for a reboot.
57
52- time_in_state 58- time_in_state
53This gives the amount of time spent in each of the frequencies supported by 59This gives the amount of time spent in each of the frequencies supported by
54this CPU. The cat output will have "<frequency> <time>" pair in each line, which 60this CPU. The cat output will have "<frequency> <time>" pair in each line, which
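A short shell sketch of the new "reset" attribute described above. The cpu0 path, the counter value shown and the written "1" are only examples; any CPU with cpufreq statistics enabled behaves the same way:

    $ cat /sys/devices/system/cpu/cpu0/cpufreq/stats/total_trans
    57
    # Clear the counters, e.g. before evaluating a different governor:
    $ echo 1 > /sys/devices/system/cpu/cpu0/cpufreq/stats/reset
    $ cat /sys/devices/system/cpu/cpu0/cpufreq/stats/total_trans
    0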
diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index e6bd1e6512a5..1953994ef5e6 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -48,7 +48,7 @@ In addition to the frequency-controlling interfaces provided by the cpufreq
48core, the driver provides its own sysfs files to control the P-State selection. 48core, the driver provides its own sysfs files to control the P-State selection.
49These files have been added to /sys/devices/system/cpu/intel_pstate/. 49These files have been added to /sys/devices/system/cpu/intel_pstate/.
50Any changes made to these files are applicable to all CPUs (even in a 50Any changes made to these files are applicable to all CPUs (even in a
51multi-package system). 51multi-package system; refer to the "Per-CPU limits" section below).
52 52
53 max_perf_pct: Limits the maximum P-State that will be requested by 53 max_perf_pct: Limits the maximum P-State that will be requested by
54 the driver. It states it as a percentage of the available performance. The 54 the driver. It states it as a percentage of the available performance. The
@@ -120,13 +120,57 @@ frequency is fictional for Intel Core processors. Even if the scaling
120driver selects a single P-State, the actual frequency the processor 120driver selects a single P-State, the actual frequency the processor
121will run at is selected by the processor itself. 121will run at is selected by the processor itself.
122 122
123Per-CPU limits
124
125The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
126the intel_pstate driver to use per-CPU performance limits. When it is set,
127the sysfs control interface described above is subject to limitations.
128- The following controls are not available (for either reading or writing):
129 /sys/devices/system/cpu/intel_pstate/max_perf_pct
130 /sys/devices/system/cpu/intel_pstate/min_perf_pct
131- The following controls can be used to set performance limits, as far as the
132architecture of the processor permits:
133 /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
134 /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
135 /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
136- User can still observe turbo percent and number of P-States from
137 /sys/devices/system/cpu/intel_pstate/turbo_pct
138 /sys/devices/system/cpu/intel_pstate/num_pstates
139- The user can still read/write the system-wide turbo status:
140 /sys/devices/system/cpu/no_turbo
141
142Support of energy performance hints
143It is possible to provide hints to the HWP algorithms in the processor
144to be more performance centric to more energy centric. When the driver
145is using HWP, two additional cpufreq sysfs attributes are presented for
146each logical CPU.
147These attributes are:
148 - energy_performance_available_preferences
149 - energy_performance_preference
150
151To get list of supported hints:
152$ cat energy_performance_available_preferences
153 default performance balance_performance balance_power power
154
155The current preference can be read or changed via cpufreq sysfs
156attribute "energy_performance_preference". Reading from this attribute
157will display current effective setting. User can write any of the valid
158preference string to this attribute. User can always restore to power-on
159default by writing "default".
160
161Since threads can migrate to different CPUs, it is possible that the
162new CPU may have a different energy performance preference than the
163previous one. To avoid such issues, either pin threads to specific CPUs
164or set the same energy performance preference value for all CPUs.
165
123Tuning Intel P-State driver 166Tuning Intel P-State driver
124 167
125When HWP mode is not used, debugfs files have also been added to allow the 168When the performance can be tuned using a PID (Proportional Integral
126tuning of the internal governor algorithm. These files are located at 169Derivative) controller, debugfs files are provided for adjusting performance.
127/sys/kernel/debug/pstate_snb/. The algorithm uses a PID (Proportional 170They are presented under:
128Integral Derivative) controller. The PID tunable parameters are: 171/sys/kernel/debug/pstate_snb/
129 172
173The PID tunable parameters are:
130 deadband 174 deadband
131 d_gain_pct 175 d_gain_pct
132 i_gain_pct 176 i_gain_pct
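A hedged example of the per-CPU controls and energy/performance hints described above. The frequency value is hypothetical, the hint attributes are present only when intel_pstate uses HWP, and per-CPU limits require intel_pstate=per_cpu_perf_limits on the kernel command line:

    # Cap CPU3 only, leaving the other CPUs unconstrained:
    $ echo 2000000 > /sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq

    # Read and change the energy/performance hint for CPU0:
    $ cat /sys/devices/system/cpu/cpu0/cpufreq/energy_performance_preference
    balance_performance
    $ echo power > /sys/devices/system/cpu/cpu0/cpufreq/energy_performance_preference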
diff --git a/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
new file mode 100644
index 000000000000..af2385795d78
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
@@ -0,0 +1,78 @@
1Broadcom AVS mail box and interrupt register bindings
2=====================================================
3
4A total of three DT nodes are required. One node (brcm,avs-cpu-data-mem)
5references the mailbox register used to communicate with the AVS CPU[1]. The
6second node (brcm,avs-cpu-l2-intr) is required to trigger an interrupt on
7the AVS CPU. The interrupt tells the AVS CPU that it needs to process a
8command sent to it by a driver. Interrupting the AVS CPU is mandatory for
9commands to be processed.
10
11The interface also requires a reference to the AVS host interrupt controller,
12so a driver can react to interrupts generated by the AVS CPU whenever a command
13has been processed. See [2] for more information on the brcm,l2-intc node.
14
15[1] The AVS CPU is an independent co-processor that runs proprietary
16firmware. On some SoCs, this firmware supports DFS and DVFS in addition to
17Adaptive Voltage Scaling.
18
19[2] Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt
20
21
22Node brcm,avs-cpu-data-mem
23--------------------------
24
25Required properties:
26- compatible: must include: brcm,avs-cpu-data-mem and
27 should include: one of brcm,bcm7271-avs-cpu-data-mem or
28 brcm,bcm7268-avs-cpu-data-mem
29- reg: Specifies base physical address and size of the registers.
30- interrupts: The interrupt that the AVS CPU will use to interrupt the host
31 when a command completed.
32- interrupt-parent: The interrupt controller the above interrupt is routed
33 through.
34- interrupt-names: The name of the interrupt used to interrupt the host.
35
36Optional properties:
37- None
38
39Node brcm,avs-cpu-l2-intr
40-------------------------
41
42Required properties:
43- compatible: must include: brcm,avs-cpu-l2-intr and
44 should include: one of brcm,bcm7271-avs-cpu-l2-intr or
45 brcm,bcm7268-avs-cpu-l2-intr
46- reg: Specifies base physical address and size of the registers.
47
48Optional properties:
49- None
50
51
52Example
53=======
54
55 avs_host_l2_intc: interrupt-controller@f04d1200 {
56 #interrupt-cells = <1>;
57 compatible = "brcm,l2-intc";
58 interrupt-parent = <&intc>;
59 reg = <0xf04d1200 0x48>;
60 interrupt-controller;
61 interrupts = <0x0 0x19 0x0>;
62 interrupt-names = "avs";
63 };
64
65 avs-cpu-data-mem@f04c4000 {
66 compatible = "brcm,bcm7271-avs-cpu-data-mem",
67 "brcm,avs-cpu-data-mem";
68 reg = <0xf04c4000 0x60>;
69 interrupts = <0x1a>;
70 interrupt-parent = <&avs_host_l2_intc>;
71 interrupt-names = "sw_intr";
72 };
73
74 avs-cpu-l2-intr@f04d1100 {
75 compatible = "brcm,bcm7271-avs-cpu-l2-intr",
76 "brcm,avs-cpu-l2-intr";
77 reg = <0xf04d1100 0x10>;
78 };
diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index ee91cbdd95ee..9f5ca4457b5f 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -86,8 +86,14 @@ Optional properties:
86 Single entry is for target voltage and three entries are for <target min max> 86 Single entry is for target voltage and three entries are for <target min max>
87 voltages. 87 voltages.
88 88
89 Entries for multiple regulators must be present in the same order as 89 Entries for multiple regulators shall be provided in the same field separated
90 regulators are specified in device's DT node. 90 by angular brackets <>. The OPP binding doesn't provide any provisions to
91 relate the values to their power supplies or the order in which the supplies
92 need to be configured and that is left for the implementation specific
93 binding.
94
95 Entries for all regulators shall be of the same size, i.e. either all use a
96 single value or triplets.
91 97
92- opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to 98- opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to
93 the above opp-microvolt property, but allows multiple voltage ranges to be 99 the above opp-microvolt property, but allows multiple voltage ranges to be
@@ -104,10 +110,13 @@ Optional properties:
104 110
105 Should only be set if opp-microvolt is set for the OPP. 111 Should only be set if opp-microvolt is set for the OPP.
106 112
107 Entries for multiple regulators must be present in the same order as 113 Entries for multiple regulators shall be provided in the same field separated
108 regulators are specified in device's DT node. If this property isn't required 114 by angular brackets <>. If current values aren't required for a regulator,
109 for few regulators, then this should be marked as zero for them. If it isn't 115 then it shall be filled with 0. If current values aren't required for any of
110 required for any regulator, then this property need not be present. 116 the regulators, then this field is not required. The OPP binding doesn't
117 provide any provisions to relate the values to their power supplies or the
118 order in which the supplies need to be configured and that is left for the
119 implementation specific binding.
111 120
112- opp-microamp-<name>: Named opp-microamp property. Similar to 121- opp-microamp-<name>: Named opp-microamp property. Similar to
113 opp-microvolt-<name> property, but for microamp instead. 122 opp-microvolt-<name> property, but for microamp instead.
@@ -386,10 +395,12 @@ Example 4: Handling multiple regulators
386/ { 395/ {
387 cpus { 396 cpus {
388 cpu@0 { 397 cpu@0 {
389 compatible = "arm,cortex-a7"; 398 compatible = "vendor,cpu-type";
390 ... 399 ...
391 400
392 cpu-supply = <&cpu_supply0>, <&cpu_supply1>, <&cpu_supply2>; 401 vcc0-supply = <&cpu_supply0>;
402 vcc1-supply = <&cpu_supply1>;
403 vcc2-supply = <&cpu_supply2>;
393 operating-points-v2 = <&cpu0_opp_table>; 404 operating-points-v2 = <&cpu0_opp_table>;
394 }; 405 };
395 }; 406 };
diff --git a/Documentation/devicetree/bindings/power/domain-idle-state.txt b/Documentation/devicetree/bindings/power/domain-idle-state.txt
new file mode 100644
index 000000000000..eefc7ed22ca2
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/domain-idle-state.txt
@@ -0,0 +1,33 @@
1PM Domain Idle State Node:
2
3A domain idle state node represents the state parameters that will be used to
4select the state when there are no active components in the domain.
5
6The state node has the following parameters -
7
8- compatible:
9 Usage: Required
10 Value type: <string>
11 Definition: Must be "domain-idle-state".
12
13- entry-latency-us
14 Usage: Required
15 Value type: <prop-encoded-array>
16 Definition: u32 value representing worst case latency in
17 microseconds required to enter the idle state.
18 The exit-latency-us duration may be guaranteed
19 only after entry-latency-us has passed.
20
21- exit-latency-us
22 Usage: Required
23 Value type: <prop-encoded-array>
24 Definition: u32 value representing worst case latency
25 in microseconds required to exit the idle state.
26
27- min-residency-us
28 Usage: Required
29 Value type: <prop-encoded-array>
30 Definition: u32 value representing minimum residency duration
31 in microseconds after which the idle state will yield
32 power benefits after overcoming the overhead in entering
33 the idle state.
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 025b5e7df61c..723e1ad937da 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -29,6 +29,15 @@ Optional properties:
29 specified by this binding. More details about power domain specifier are 29 specified by this binding. More details about power domain specifier are
30 available in the next section. 30 available in the next section.
31 31
32- domain-idle-states : A phandle of an idle-state that shall be soaked into a
33 generic domain power state. The idle state definitions are
34 compatible with domain-idle-state specified in [1].
35 The domain-idle-state property reflects the idle state of this PM domain and
36 not the idle states of the devices or sub-domains in the PM domain. Devices
37 and sub-domains have their own idle-states independent of the parent
38 domain's idle states. In the absence of this property, the domain would be
39 considered as capable of being powered-on or powered-off.
40
32Example: 41Example:
33 42
34 power: power-controller@12340000 { 43 power: power-controller@12340000 {
@@ -59,6 +68,38 @@ The nodes above define two power controllers: 'parent' and 'child'.
59Domains created by the 'child' power controller are subdomains of '0' power 68Domains created by the 'child' power controller are subdomains of '0' power
60domain provided by the 'parent' power controller. 69domain provided by the 'parent' power controller.
61 70
71Example 3:
72 parent: power-controller@12340000 {
73 compatible = "foo,power-controller";
74 reg = <0x12340000 0x1000>;
75 #power-domain-cells = <0>;
76 domain-idle-states = <&DOMAIN_RET>, <&DOMAIN_PWR_DN>;
77 };
78
79 child: power-controller@12341000 {
80 compatible = "foo,power-controller";
81 reg = <0x12341000 0x1000>;
82 power-domains = <&parent 0>;
83 #power-domain-cells = <0>;
84 domain-idle-states = <&DOMAIN_PWR_DN>;
85 };
86
87 DOMAIN_RET: state@0 {
88 compatible = "domain-idle-state";
89 reg = <0x0>;
90 entry-latency-us = <1000>;
91 exit-latency-us = <2000>;
92 min-residency-us = <10000>;
93 };
94
95 DOMAIN_PWR_DN: state@1 {
96 compatible = "domain-idle-state";
97 reg = <0x1>;
98 entry-latency-us = <5000>;
99 exit-latency-us = <8000>;
100 min-residency-us = <7000>;
101 };
102
62==PM domain consumers== 103==PM domain consumers==
63 104
64Required properties: 105Required properties:
@@ -76,3 +117,5 @@ Example:
76The node above defines a typical PM domain consumer device, which is located 117The node above defines a typical PM domain consumer device, which is located
77inside a PM domain with index 0 of a power controller represented by a node 118inside a PM domain with index 0 of a power controller represented by a node
78with the label "power". 119with the label "power".
120
121[1]. Documentation/devicetree/bindings/power/domain-idle-state.txt
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 8ba6625fdd63..73ddea39a9ce 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -607,7 +607,9 @@ individually. Instead, a set of devices sharing a power resource can be put
607into a low-power state together at the same time by turning off the shared 607into a low-power state together at the same time by turning off the shared
608power resource. Of course, they also need to be put into the full-power state 608power resource. Of course, they also need to be put into the full-power state
609together, by turning the shared power resource on. A set of devices with this 609together, by turning the shared power resource on. A set of devices with this
610property is often referred to as a power domain. 610property is often referred to as a power domain. A power domain may also be
611nested inside another power domain. The nested domain is referred to as the
612sub-domain of the parent domain.
611 613
612Support for power domains is provided through the pm_domain field of struct 614Support for power domains is provided through the pm_domain field of struct
613device. This field is a pointer to an object of type struct dev_pm_domain, 615device. This field is a pointer to an object of type struct dev_pm_domain,
@@ -629,6 +631,16 @@ support for power domains into subsystem-level callbacks, for example by
629modifying the platform bus type. Other platforms need not implement it or take 631modifying the platform bus type. Other platforms need not implement it or take
630it into account in any way. 632it into account in any way.
631 633
634Devices may be defined as IRQ-safe which indicates to the PM core that their
635runtime PM callbacks may be invoked with disabled interrupts (see
636Documentation/power/runtime_pm.txt for more information). If an IRQ-safe
637device belongs to a PM domain, the runtime PM of the domain will be
638disallowed, unless the domain itself is defined as IRQ-safe. However, it
639makes sense to define a PM domain as IRQ-safe only if all the devices in it
640are IRQ-safe. Moreover, if an IRQ-safe domain has a parent domain, the runtime
641PM of the parent is only allowed if the parent itself is IRQ-safe too with the
642additional restriction that all child domains of an IRQ-safe parent must also
643be IRQ-safe.
632 644
633Device Low Power (suspend) States 645Device Low Power (suspend) States
634--------------------------------- 646---------------------------------
diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
index 50f3ef9177c1..8a39ce45d8a0 100644
--- a/Documentation/power/states.txt
+++ b/Documentation/power/states.txt
@@ -8,25 +8,43 @@ for each state.
8 8
9The states are represented by strings that can be read or written to the 9The states are represented by strings that can be read or written to the
10/sys/power/state file. Those strings may be "mem", "standby", "freeze" and 10/sys/power/state file. Those strings may be "mem", "standby", "freeze" and
11"disk", where the last one always represents hibernation (Suspend-To-Disk) and 11"disk", where the last three always represent Power-On Suspend (if supported),
12the meaning of the remaining ones depends on the relative_sleep_states command 12Suspend-To-Idle and hibernation (Suspend-To-Disk), respectively.
13line argument. 13
14 14The meaning of the "mem" string is controlled by the /sys/power/mem_sleep file.
15For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the 15It contains strings representing the available modes of system suspend that may
16available non-hibernation sleep states from the deepest to the shallowest, 16be triggered by writing "mem" to /sys/power/state. These modes are "s2idle"
17respectively. In that case, "mem" is always present in /sys/power/state, 17(Suspend-To-Idle), "shallow" (Power-On Suspend) and "deep" (Suspend-To-RAM).
18because there is at least one non-hibernation sleep state in every system. If 18The "s2idle" mode is always available, while the other ones are only available
19the given system supports two non-hibernation sleep states, "standby" is present 19if supported by the platform (if not supported, the strings representing them
20in /sys/power/state in addition to "mem". If the system supports three 20are not present in /sys/power/mem_sleep). The string representing the suspend
21non-hibernation sleep states, "freeze" will be present in /sys/power/state in 21mode to be used subsequently is enclosed in square brackets. Writing one of
22addition to "mem" and "standby". 22the other strings present in /sys/power/mem_sleep to it causes the suspend mode
23 23to be used subsequently to change to the one represented by that string.
24For relative_sleep_states=0, which is the default, the following descriptions 24
25apply. 25Consequently, there are two ways to cause the system to go into the
26 26Suspend-To-Idle sleep state. The first one is to write "freeze" directly to
27state: Suspend-To-Idle 27/sys/power/state. The second one is to write "s2idle" to /sys/power/mem_sleep
28and then to write "mem" to /sys/power/state. Similarly, there are two ways
29to cause the system to go into the Power-On Suspend sleep state (the strings to
30write to the control files in that case are "standby" or "shallow" and "mem",
31respectively) if that state is supported by the platform. In turn, there is
32only one way to cause the system to go into the Suspend-To-RAM state (write
33"deep" into /sys/power/mem_sleep and "mem" into /sys/power/state).
34
35The default suspend mode (ie. the one to be used without writing anything into
36/sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or
37"s2idle", but it can be overridden by the value of the "mem_sleep_default"
38parameter in the kernel command line. On some ACPI-based systems, depending on
39the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM
40is supported.
41
42The properties of all of the sleep states are described below.
43
44
45State: Suspend-To-Idle
28ACPI state: S0 46ACPI state: S0
29Label: "freeze" 47Label: "s2idle" ("freeze")
30 48
31This state is a generic, pure software, light-weight, system sleep state. 49This state is a generic, pure software, light-weight, system sleep state.
32It allows more energy to be saved relative to runtime idle by freezing user 50It allows more energy to be saved relative to runtime idle by freezing user
@@ -35,13 +53,13 @@ lower-power than available at run time), such that the processors can
35spend more time in their idle states. 53spend more time in their idle states.
36 54
37This state can be used for platforms without Power-On Suspend/Suspend-to-RAM 55This state can be used for platforms without Power-On Suspend/Suspend-to-RAM
38support, or it can be used in addition to Suspend-to-RAM (memory sleep) 56support, or it can be used in addition to Suspend-to-RAM to provide reduced
39to provide reduced resume latency. It is always supported. 57resume latency. It is always supported.
40 58
41 59
42State: Standby / Power-On Suspend 60State: Standby / Power-On Suspend
43ACPI State: S1 61ACPI State: S1
44Label: "standby" 62Label: "shallow" ("standby")
45 63
46This state, if supported, offers moderate, though real, power savings, while 64This state, if supported, offers moderate, though real, power savings, while
47providing a relatively low-latency transition back to a working system. No 65providing a relatively low-latency transition back to a working system. No
@@ -58,7 +76,7 @@ state.
58 76
59State: Suspend-to-RAM 77State: Suspend-to-RAM
60ACPI State: S3 78ACPI State: S3
61Label: "mem" 79Label: "deep"
62 80
63This state, if supported, offers significant power savings as everything in the 81This state, if supported, offers significant power savings as everything in the
64system is put into a low-power state, except for memory, which should be placed 82system is put into a low-power state, except for memory, which should be placed
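The two equivalent ways of reaching Suspend-To-Idle described above, as a small shell sketch (root required; selecting "s2idle" in /sys/power/mem_sleep only changes which state "mem" maps to and does not suspend by itself):

    # Directly:
    $ echo freeze > /sys/power/state

    # Or by selecting the suspend mode first:
    $ echo s2idle > /sys/power/mem_sleep
    $ echo mem > /sys/power/state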
diff --git a/MAINTAINERS b/MAINTAINERS
index 3d7d66cdb44c..34ef63763566 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2764,6 +2764,14 @@ L: bcm-kernel-feedback-list@broadcom.com
2764S: Maintained 2764S: Maintained
2765F: drivers/mtd/nand/brcmnand/ 2765F: drivers/mtd/nand/brcmnand/
2766 2766
2767BROADCOM STB AVS CPUFREQ DRIVER
2768M: Markus Mayer <mmayer@broadcom.com>
2769M: bcm-kernel-feedback-list@broadcom.com
2770L: linux-pm@vger.kernel.org
2771S: Maintained
2772F: Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
2773F: drivers/cpufreq/brcmstb*
2774
2767BROADCOM SPECIFIC AMBA DRIVER (BCMA) 2775BROADCOM SPECIFIC AMBA DRIVER (BCMA)
2768M: Rafał Miłecki <zajec5@gmail.com> 2776M: Rafał Miłecki <zajec5@gmail.com>
2769L: linux-wireless@vger.kernel.org 2777L: linux-wireless@vger.kernel.org
@@ -3356,6 +3364,7 @@ L: linux-pm@vger.kernel.org
3356S: Maintained 3364S: Maintained
3357T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git 3365T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
3358T: git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates) 3366T: git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates)
3367B: https://bugzilla.kernel.org
3359F: Documentation/cpu-freq/ 3368F: Documentation/cpu-freq/
3360F: drivers/cpufreq/ 3369F: drivers/cpufreq/
3361F: include/linux/cpufreq.h 3370F: include/linux/cpufreq.h
@@ -3395,6 +3404,7 @@ M: Daniel Lezcano <daniel.lezcano@linaro.org>
3395L: linux-pm@vger.kernel.org 3404L: linux-pm@vger.kernel.org
3396S: Maintained 3405S: Maintained
3397T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git 3406T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
3407B: https://bugzilla.kernel.org
3398F: drivers/cpuidle/* 3408F: drivers/cpuidle/*
3399F: include/linux/cpuidle.h 3409F: include/linux/cpuidle.h
3400 3410
@@ -6362,9 +6372,11 @@ S: Maintained
6362F: drivers/platform/x86/intel-vbtn.c 6372F: drivers/platform/x86/intel-vbtn.c
6363 6373
6364INTEL IDLE DRIVER 6374INTEL IDLE DRIVER
6375M: Jacob Pan <jacob.jun.pan@linux.intel.com>
6365M: Len Brown <lenb@kernel.org> 6376M: Len Brown <lenb@kernel.org>
6366L: linux-pm@vger.kernel.org 6377L: linux-pm@vger.kernel.org
6367T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git 6378T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git
6379B: https://bugzilla.kernel.org
6368S: Supported 6380S: Supported
6369F: drivers/idle/intel_idle.c 6381F: drivers/idle/intel_idle.c
6370 6382
diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c
index b54db47f6f32..1dc2a34b9dbd 100644
--- a/arch/arm/mach-imx/gpc.c
+++ b/arch/arm/mach-imx/gpc.c
@@ -380,13 +380,6 @@ static struct pu_domain imx6q_pu_domain = {
380 .name = "PU", 380 .name = "PU",
381 .power_off = imx6q_pm_pu_power_off, 381 .power_off = imx6q_pm_pu_power_off,
382 .power_on = imx6q_pm_pu_power_on, 382 .power_on = imx6q_pm_pu_power_on,
383 .states = {
384 [0] = {
385 .power_off_latency_ns = 25000,
386 .power_on_latency_ns = 2000000,
387 },
388 },
389 .state_count = 1,
390 }, 383 },
391}; 384};
392 385
@@ -430,6 +423,16 @@ static int imx_gpc_genpd_init(struct device *dev, struct regulator *pu_reg)
430 if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) 423 if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
431 return 0; 424 return 0;
432 425
426 imx6q_pu_domain.base.states = devm_kzalloc(dev,
427 sizeof(*imx6q_pu_domain.base.states),
428 GFP_KERNEL);
429 if (!imx6q_pu_domain.base.states)
430 return -ENOMEM;
431
432 imx6q_pu_domain.base.states[0].power_off_latency_ns = 25000;
433 imx6q_pu_domain.base.states[0].power_on_latency_ns = 2000000;
434 imx6q_pu_domain.base.state_count = 1;
435
433 for (i = 0; i < ARRAY_SIZE(imx_gpc_domains); i++) 436 for (i = 0; i < ARRAY_SIZE(imx_gpc_domains); i++)
434 pm_genpd_init(imx_gpc_domains[i], NULL, false); 437 pm_genpd_init(imx_gpc_domains[i], NULL, false);
435 438
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
109 movq pt_regs_r14(%rax), %r14 109 movq pt_regs_r14(%rax), %r14
110 movq pt_regs_r15(%rax), %r15 110 movq pt_regs_r15(%rax), %r15
111 111
112#ifdef CONFIG_KASAN
113 /*
114 * The suspend path may have poisoned some areas deeper in the stack,
115 * which we now need to unpoison.
116 */
117 movq %rsp, %rdi
118 call kasan_unpoison_task_stack_below
119#endif
120
112 xorl %eax, %eax 121 xorl %eax, %eax
113 addq $8, %rsp 122 addq $8, %rsp
114 FRAME_END 123 FRAME_END
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 9634557a5444..ded2e8272382 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,10 @@
11#include <linux/gfp.h> 11#include <linux/gfp.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/scatterlist.h>
15#include <linux/kdebug.h>
16
17#include <crypto/hash.h>
14 18
15#include <asm/init.h> 19#include <asm/init.h>
16#include <asm/proto.h> 20#include <asm/proto.h>
@@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn)
177 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); 181 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
178} 182}
179 183
184#define MD5_DIGEST_SIZE 16
185
180struct restore_data_record { 186struct restore_data_record {
181 unsigned long jump_address; 187 unsigned long jump_address;
182 unsigned long jump_address_phys; 188 unsigned long jump_address_phys;
183 unsigned long cr3; 189 unsigned long cr3;
184 unsigned long magic; 190 unsigned long magic;
191 u8 e820_digest[MD5_DIGEST_SIZE];
185}; 192};
186 193
187#define RESTORE_MAGIC 0x123456789ABCDEF0UL 194#define RESTORE_MAGIC 0x23456789ABCDEF01UL
195
196#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
197/**
198 * get_e820_md5 - calculate md5 according to given e820 map
199 *
200 * @map: the e820 map to be calculated
201 * @buf: the md5 result to be stored to
202 */
203static int get_e820_md5(struct e820map *map, void *buf)
204{
205 struct scatterlist sg;
206 struct crypto_ahash *tfm;
207 int size;
208 int ret = 0;
209
210 tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
211 if (IS_ERR(tfm))
212 return -ENOMEM;
213
214 {
215 AHASH_REQUEST_ON_STACK(req, tfm);
216 size = offsetof(struct e820map, map)
217 + sizeof(struct e820entry) * map->nr_map;
218 ahash_request_set_tfm(req, tfm);
219 sg_init_one(&sg, (u8 *)map, size);
220 ahash_request_set_callback(req, 0, NULL, NULL);
221 ahash_request_set_crypt(req, &sg, buf, size);
222
223 if (crypto_ahash_digest(req))
224 ret = -EINVAL;
225 ahash_request_zero(req);
226 }
227 crypto_free_ahash(tfm);
228
229 return ret;
230}
231
232static void hibernation_e820_save(void *buf)
233{
234 get_e820_md5(e820_saved, buf);
235}
236
237static bool hibernation_e820_mismatch(void *buf)
238{
239 int ret;
240 u8 result[MD5_DIGEST_SIZE];
241
242 memset(result, 0, MD5_DIGEST_SIZE);
243 /* If there is no digest in suspend kernel, let it go. */
244 if (!memcmp(result, buf, MD5_DIGEST_SIZE))
245 return false;
246
247 ret = get_e820_md5(e820_saved, result);
248 if (ret)
249 return true;
250
251 return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
252}
253#else
254static void hibernation_e820_save(void *buf)
255{
256}
257
258static bool hibernation_e820_mismatch(void *buf)
259{
260 /* If md5 is not builtin for restore kernel, let it go. */
261 return false;
262}
263#endif
188 264
189/** 265/**
190 * arch_hibernation_header_save - populate the architecture specific part 266 * arch_hibernation_header_save - populate the architecture specific part
@@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
201 rdr->jump_address_phys = __pa_symbol(&restore_registers); 277 rdr->jump_address_phys = __pa_symbol(&restore_registers);
202 rdr->cr3 = restore_cr3; 278 rdr->cr3 = restore_cr3;
203 rdr->magic = RESTORE_MAGIC; 279 rdr->magic = RESTORE_MAGIC;
280
281 hibernation_e820_save(rdr->e820_digest);
282
204 return 0; 283 return 0;
205} 284}
206 285
@@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr)
216 restore_jump_address = rdr->jump_address; 295 restore_jump_address = rdr->jump_address;
217 jump_address_phys = rdr->jump_address_phys; 296 jump_address_phys = rdr->jump_address_phys;
218 restore_cr3 = rdr->cr3; 297 restore_cr3 = rdr->cr3;
219 return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; 298
299 if (rdr->magic != RESTORE_MAGIC) {
300 pr_crit("Unrecognized hibernate image header format!\n");
301 return -EINVAL;
302 }
303
304 if (hibernation_e820_mismatch(rdr->e820_digest)) {
305 pr_crit("Hibernate inconsistent memory map detected!\n");
306 return -ENODEV;
307 }
308
309 return 0;
220} 310}
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index bb01dea39fdc..f0b4a981b8d3 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -157,7 +157,7 @@ static void acpi_processor_ppc_ost(acpi_handle handle, int status)
157 status, NULL); 157 status, NULL);
158} 158}
159 159
160int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) 160void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
161{ 161{
162 int ret; 162 int ret;
163 163
@@ -168,7 +168,7 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
168 */ 168 */
169 if (event_flag) 169 if (event_flag)
170 acpi_processor_ppc_ost(pr->handle, 1); 170 acpi_processor_ppc_ost(pr->handle, 1);
171 return 0; 171 return;
172 } 172 }
173 173
174 ret = acpi_processor_get_platform_limit(pr); 174 ret = acpi_processor_get_platform_limit(pr);
@@ -182,10 +182,8 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
182 else 182 else
183 acpi_processor_ppc_ost(pr->handle, 0); 183 acpi_processor_ppc_ost(pr->handle, 0);
184 } 184 }
185 if (ret < 0) 185 if (ret >= 0)
186 return (ret); 186 cpufreq_update_policy(pr->id);
187 else
188 return cpufreq_update_policy(pr->id);
189} 187}
190 188
191int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) 189int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
@@ -465,11 +463,33 @@ int acpi_processor_get_performance_info(struct acpi_processor *pr)
465 return result; 463 return result;
466} 464}
467EXPORT_SYMBOL_GPL(acpi_processor_get_performance_info); 465EXPORT_SYMBOL_GPL(acpi_processor_get_performance_info);
468int acpi_processor_notify_smm(struct module *calling_module) 466
467int acpi_processor_pstate_control(void)
469{ 468{
470 acpi_status status; 469 acpi_status status;
471 static int is_done = 0;
472 470
471 if (!acpi_gbl_FADT.smi_command || !acpi_gbl_FADT.pstate_control)
472 return 0;
473
474 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
475 "Writing pstate_control [0x%x] to smi_command [0x%x]\n",
476 acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
477
478 status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
479 (u32)acpi_gbl_FADT.pstate_control, 8);
480 if (ACPI_SUCCESS(status))
481 return 1;
482
483 ACPI_EXCEPTION((AE_INFO, status,
484 "Failed to write pstate_control [0x%x] to smi_command [0x%x]",
485 acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
486 return -EIO;
487}
488
489int acpi_processor_notify_smm(struct module *calling_module)
490{
491 static int is_done = 0;
492 int result;
473 493
474 if (!(acpi_processor_ppc_status & PPC_REGISTERED)) 494 if (!(acpi_processor_ppc_status & PPC_REGISTERED))
475 return -EBUSY; 495 return -EBUSY;
@@ -492,26 +512,15 @@ int acpi_processor_notify_smm(struct module *calling_module)
492 512
493 is_done = -EIO; 513 is_done = -EIO;
494 514
495 /* Can't write pstate_control to smi_command if either value is zero */ 515 result = acpi_processor_pstate_control();
496 if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) { 516 if (!result) {
497 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_control\n")); 517 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_control\n"));
498 module_put(calling_module); 518 module_put(calling_module);
499 return 0; 519 return 0;
500 } 520 }
501 521 if (result < 0) {
502 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
503 "Writing pstate_control [0x%x] to smi_command [0x%x]\n",
504 acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
505
506 status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
507 (u32) acpi_gbl_FADT.pstate_control, 8);
508 if (ACPI_FAILURE(status)) {
509 ACPI_EXCEPTION((AE_INFO, status,
510 "Failed to write pstate_control [0x%x] to "
511 "smi_command [0x%x]", acpi_gbl_FADT.pstate_control,
512 acpi_gbl_FADT.smi_command));
513 module_put(calling_module); 522 module_put(calling_module);
514 return status; 523 return result;
515 } 524 }
516 525
517 /* Success. If there's no _PPC, we need to fear nothing, so 526 /* Success. If there's no _PPC, we need to fear nothing, so
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 54abb26b7366..9b6cebe227a0 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -674,6 +674,14 @@ static void acpi_sleep_suspend_setup(void)
674 if (acpi_sleep_state_supported(i)) 674 if (acpi_sleep_state_supported(i))
675 sleep_states[i] = 1; 675 sleep_states[i] = 1;
676 676
677 /*
678 * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and
679 * the default suspend mode was not selected from the command line.
680 */
681 if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 &&
682 mem_sleep_default > PM_SUSPEND_MEM)
683 mem_sleep_default = PM_SUSPEND_FREEZE;
684
677 suspend_set_ops(old_suspend_ordering ? 685 suspend_set_ops(old_suspend_ordering ?
678 &acpi_suspend_ops_old : &acpi_suspend_ops); 686 &acpi_suspend_ops_old : &acpi_suspend_ops);
679 freeze_set_ops(&acpi_freeze_ops); 687 freeze_set_ops(&acpi_freeze_ops);
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index e023066e4215..5711708532db 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -39,6 +39,105 @@
39static LIST_HEAD(gpd_list); 39static LIST_HEAD(gpd_list);
40static DEFINE_MUTEX(gpd_list_lock); 40static DEFINE_MUTEX(gpd_list_lock);
41 41
42struct genpd_lock_ops {
43 void (*lock)(struct generic_pm_domain *genpd);
44 void (*lock_nested)(struct generic_pm_domain *genpd, int depth);
45 int (*lock_interruptible)(struct generic_pm_domain *genpd);
46 void (*unlock)(struct generic_pm_domain *genpd);
47};
48
49static void genpd_lock_mtx(struct generic_pm_domain *genpd)
50{
51 mutex_lock(&genpd->mlock);
52}
53
54static void genpd_lock_nested_mtx(struct generic_pm_domain *genpd,
55 int depth)
56{
57 mutex_lock_nested(&genpd->mlock, depth);
58}
59
60static int genpd_lock_interruptible_mtx(struct generic_pm_domain *genpd)
61{
62 return mutex_lock_interruptible(&genpd->mlock);
63}
64
65static void genpd_unlock_mtx(struct generic_pm_domain *genpd)
66{
67 return mutex_unlock(&genpd->mlock);
68}
69
70static const struct genpd_lock_ops genpd_mtx_ops = {
71 .lock = genpd_lock_mtx,
72 .lock_nested = genpd_lock_nested_mtx,
73 .lock_interruptible = genpd_lock_interruptible_mtx,
74 .unlock = genpd_unlock_mtx,
75};
76
77static void genpd_lock_spin(struct generic_pm_domain *genpd)
78 __acquires(&genpd->slock)
79{
80 unsigned long flags;
81
82 spin_lock_irqsave(&genpd->slock, flags);
83 genpd->lock_flags = flags;
84}
85
86static void genpd_lock_nested_spin(struct generic_pm_domain *genpd,
87 int depth)
88 __acquires(&genpd->slock)
89{
90 unsigned long flags;
91
92 spin_lock_irqsave_nested(&genpd->slock, flags, depth);
93 genpd->lock_flags = flags;
94}
95
96static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
97 __acquires(&genpd->slock)
98{
99 unsigned long flags;
100
101 spin_lock_irqsave(&genpd->slock, flags);
102 genpd->lock_flags = flags;
103 return 0;
104}
105
106static void genpd_unlock_spin(struct generic_pm_domain *genpd)
107 __releases(&genpd->slock)
108{
109 spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
110}
111
112static const struct genpd_lock_ops genpd_spin_ops = {
113 .lock = genpd_lock_spin,
114 .lock_nested = genpd_lock_nested_spin,
115 .lock_interruptible = genpd_lock_interruptible_spin,
116 .unlock = genpd_unlock_spin,
117};
118
119#define genpd_lock(p) p->lock_ops->lock(p)
120#define genpd_lock_nested(p, d) p->lock_ops->lock_nested(p, d)
121#define genpd_lock_interruptible(p) p->lock_ops->lock_interruptible(p)
122#define genpd_unlock(p) p->lock_ops->unlock(p)
123
124#define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE)
125
126static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev,
127 struct generic_pm_domain *genpd)
128{
129 bool ret;
130
131 ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd);
132
133 /* Warn once for each IRQ safe dev in no sleep domain */
134 if (ret)
135 dev_warn_once(dev, "PM domain %s will not be powered off\n",
136 genpd->name);
137
138 return ret;
139}
140
42/* 141/*
43 * Get the generic PM domain for a particular struct device. 142 * Get the generic PM domain for a particular struct device.
44 * This validates the struct device pointer, the PM domain pointer, 143 * This validates the struct device pointer, the PM domain pointer,
@@ -200,9 +299,9 @@ static int genpd_poweron(struct generic_pm_domain *genpd, unsigned int depth)
200 299
201 genpd_sd_counter_inc(master); 300 genpd_sd_counter_inc(master);
202 301
203 mutex_lock_nested(&master->lock, depth + 1); 302 genpd_lock_nested(master, depth + 1);
204 ret = genpd_poweron(master, depth + 1); 303 ret = genpd_poweron(master, depth + 1);
205 mutex_unlock(&master->lock); 304 genpd_unlock(master);
206 305
207 if (ret) { 306 if (ret) {
208 genpd_sd_counter_dec(master); 307 genpd_sd_counter_dec(master);
@@ -255,9 +354,9 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb,
255 spin_unlock_irq(&dev->power.lock); 354 spin_unlock_irq(&dev->power.lock);
256 355
257 if (!IS_ERR(genpd)) { 356 if (!IS_ERR(genpd)) {
258 mutex_lock(&genpd->lock); 357 genpd_lock(genpd);
259 genpd->max_off_time_changed = true; 358 genpd->max_off_time_changed = true;
260 mutex_unlock(&genpd->lock); 359 genpd_unlock(genpd);
261 } 360 }
262 361
263 dev = dev->parent; 362 dev = dev->parent;
@@ -303,7 +402,12 @@ static int genpd_poweroff(struct generic_pm_domain *genpd, bool is_async)
303 if (stat > PM_QOS_FLAGS_NONE) 402 if (stat > PM_QOS_FLAGS_NONE)
304 return -EBUSY; 403 return -EBUSY;
305 404
306 if (!pm_runtime_suspended(pdd->dev) || pdd->dev->power.irq_safe) 405 /*
406 * Do not allow PM domain to be powered off, when an IRQ safe
407 * device is part of a non-IRQ safe domain.
408 */
409 if (!pm_runtime_suspended(pdd->dev) ||
410 irq_safe_dev_in_no_sleep_domain(pdd->dev, genpd))
307 not_suspended++; 411 not_suspended++;
308 } 412 }
309 413
@@ -354,9 +458,9 @@ static void genpd_power_off_work_fn(struct work_struct *work)
354 458
355 genpd = container_of(work, struct generic_pm_domain, power_off_work); 459 genpd = container_of(work, struct generic_pm_domain, power_off_work);
356 460
357 mutex_lock(&genpd->lock); 461 genpd_lock(genpd);
358 genpd_poweroff(genpd, true); 462 genpd_poweroff(genpd, true);
359 mutex_unlock(&genpd->lock); 463 genpd_unlock(genpd);
360} 464}
361 465
362/** 466/**
@@ -466,15 +570,15 @@ static int genpd_runtime_suspend(struct device *dev)
466 } 570 }
467 571
468 /* 572 /*
469 * If power.irq_safe is set, this routine will be run with interrupts 573 * If power.irq_safe is set, this routine may be run with
470 * off, so it can't use mutexes. 574 * IRQs disabled, so suspend only if the PM domain also is irq_safe.
471 */ 575 */
472 if (dev->power.irq_safe) 576 if (irq_safe_dev_in_no_sleep_domain(dev, genpd))
473 return 0; 577 return 0;
474 578
475 mutex_lock(&genpd->lock); 579 genpd_lock(genpd);
476 genpd_poweroff(genpd, false); 580 genpd_poweroff(genpd, false);
477 mutex_unlock(&genpd->lock); 581 genpd_unlock(genpd);
478 582
479 return 0; 583 return 0;
480} 584}
@@ -503,15 +607,18 @@ static int genpd_runtime_resume(struct device *dev)
503 if (IS_ERR(genpd)) 607 if (IS_ERR(genpd))
504 return -EINVAL; 608 return -EINVAL;
505 609
506 /* If power.irq_safe, the PM domain is never powered off. */ 610 /*
507 if (dev->power.irq_safe) { 611 * As we don't power off a non IRQ safe domain, which holds
612 * an IRQ safe device, we don't need to restore power to it.
613 */
614 if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) {
508 timed = false; 615 timed = false;
509 goto out; 616 goto out;
510 } 617 }
511 618
512 mutex_lock(&genpd->lock); 619 genpd_lock(genpd);
513 ret = genpd_poweron(genpd, 0); 620 ret = genpd_poweron(genpd, 0);
514 mutex_unlock(&genpd->lock); 621 genpd_unlock(genpd);
515 622
516 if (ret) 623 if (ret)
517 return ret; 624 return ret;
@@ -546,10 +653,11 @@ static int genpd_runtime_resume(struct device *dev)
546err_stop: 653err_stop:
547 genpd_stop_dev(genpd, dev); 654 genpd_stop_dev(genpd, dev);
548err_poweroff: 655err_poweroff:
549 if (!dev->power.irq_safe) { 656 if (!pm_runtime_is_irq_safe(dev) ||
550 mutex_lock(&genpd->lock); 657 (pm_runtime_is_irq_safe(dev) && genpd_is_irq_safe(genpd))) {
658 genpd_lock(genpd);
551 genpd_poweroff(genpd, 0); 659 genpd_poweroff(genpd, 0);
552 mutex_unlock(&genpd->lock); 660 genpd_unlock(genpd);
553 } 661 }
554 662
555 return ret; 663 return ret;
@@ -732,20 +840,20 @@ static int pm_genpd_prepare(struct device *dev)
732 if (resume_needed(dev, genpd)) 840 if (resume_needed(dev, genpd))
733 pm_runtime_resume(dev); 841 pm_runtime_resume(dev);
734 842
735 mutex_lock(&genpd->lock); 843 genpd_lock(genpd);
736 844
737 if (genpd->prepared_count++ == 0) 845 if (genpd->prepared_count++ == 0)
738 genpd->suspended_count = 0; 846 genpd->suspended_count = 0;
739 847
740 mutex_unlock(&genpd->lock); 848 genpd_unlock(genpd);
741 849
742 ret = pm_generic_prepare(dev); 850 ret = pm_generic_prepare(dev);
743 if (ret) { 851 if (ret) {
744 mutex_lock(&genpd->lock); 852 genpd_lock(genpd);
745 853
746 genpd->prepared_count--; 854 genpd->prepared_count--;
747 855
748 mutex_unlock(&genpd->lock); 856 genpd_unlock(genpd);
749 } 857 }
750 858
751 return ret; 859 return ret;
@@ -936,13 +1044,13 @@ static void pm_genpd_complete(struct device *dev)
936 1044
937 pm_generic_complete(dev); 1045 pm_generic_complete(dev);
938 1046
939 mutex_lock(&genpd->lock); 1047 genpd_lock(genpd);
940 1048
941 genpd->prepared_count--; 1049 genpd->prepared_count--;
942 if (!genpd->prepared_count) 1050 if (!genpd->prepared_count)
943 genpd_queue_power_off_work(genpd); 1051 genpd_queue_power_off_work(genpd);
944 1052
945 mutex_unlock(&genpd->lock); 1053 genpd_unlock(genpd);
946} 1054}
947 1055
948/** 1056/**
@@ -1071,7 +1179,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
1071 if (IS_ERR(gpd_data)) 1179 if (IS_ERR(gpd_data))
1072 return PTR_ERR(gpd_data); 1180 return PTR_ERR(gpd_data);
1073 1181
1074 mutex_lock(&genpd->lock); 1182 genpd_lock(genpd);
1075 1183
1076 if (genpd->prepared_count > 0) { 1184 if (genpd->prepared_count > 0) {
1077 ret = -EAGAIN; 1185 ret = -EAGAIN;
@@ -1088,7 +1196,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
1088 list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); 1196 list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
1089 1197
1090 out: 1198 out:
1091 mutex_unlock(&genpd->lock); 1199 genpd_unlock(genpd);
1092 1200
1093 if (ret) 1201 if (ret)
1094 genpd_free_dev_data(dev, gpd_data); 1202 genpd_free_dev_data(dev, gpd_data);
@@ -1130,7 +1238,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
1130 gpd_data = to_gpd_data(pdd); 1238 gpd_data = to_gpd_data(pdd);
1131 dev_pm_qos_remove_notifier(dev, &gpd_data->nb); 1239 dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
1132 1240
1133 mutex_lock(&genpd->lock); 1241 genpd_lock(genpd);
1134 1242
1135 if (genpd->prepared_count > 0) { 1243 if (genpd->prepared_count > 0) {
1136 ret = -EAGAIN; 1244 ret = -EAGAIN;
@@ -1145,14 +1253,14 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
1145 1253
1146 list_del_init(&pdd->list_node); 1254 list_del_init(&pdd->list_node);
1147 1255
1148 mutex_unlock(&genpd->lock); 1256 genpd_unlock(genpd);
1149 1257
1150 genpd_free_dev_data(dev, gpd_data); 1258 genpd_free_dev_data(dev, gpd_data);
1151 1259
1152 return 0; 1260 return 0;
1153 1261
1154 out: 1262 out:
1155 mutex_unlock(&genpd->lock); 1263 genpd_unlock(genpd);
1156 dev_pm_qos_add_notifier(dev, &gpd_data->nb); 1264 dev_pm_qos_add_notifier(dev, &gpd_data->nb);
1157 1265
1158 return ret; 1266 return ret;
@@ -1183,12 +1291,23 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
1183 || genpd == subdomain) 1291 || genpd == subdomain)
1184 return -EINVAL; 1292 return -EINVAL;
1185 1293
1294 /*
 1295 * If the subdomain can be powered on/off in an IRQ safe
 1296 * context, its parent must be IRQ safe as well, since power
 1297 * changes of the subdomain may propagate to it with IRQs disabled.
1298 */
1299 if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) {
1300 WARN(1, "Parent %s of subdomain %s must be IRQ safe\n",
1301 genpd->name, subdomain->name);
1302 return -EINVAL;
1303 }
1304
1186 link = kzalloc(sizeof(*link), GFP_KERNEL); 1305 link = kzalloc(sizeof(*link), GFP_KERNEL);
1187 if (!link) 1306 if (!link)
1188 return -ENOMEM; 1307 return -ENOMEM;
1189 1308
1190 mutex_lock(&subdomain->lock); 1309 genpd_lock(subdomain);
1191 mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); 1310 genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
1192 1311
1193 if (genpd->status == GPD_STATE_POWER_OFF 1312 if (genpd->status == GPD_STATE_POWER_OFF
1194 && subdomain->status != GPD_STATE_POWER_OFF) { 1313 && subdomain->status != GPD_STATE_POWER_OFF) {
@@ -1211,8 +1330,8 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
1211 genpd_sd_counter_inc(genpd); 1330 genpd_sd_counter_inc(genpd);
1212 1331
1213 out: 1332 out:
1214 mutex_unlock(&genpd->lock); 1333 genpd_unlock(genpd);
1215 mutex_unlock(&subdomain->lock); 1334 genpd_unlock(subdomain);
1216 if (ret) 1335 if (ret)
1217 kfree(link); 1336 kfree(link);
1218 return ret; 1337 return ret;
@@ -1250,8 +1369,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
1250 if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain)) 1369 if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain))
1251 return -EINVAL; 1370 return -EINVAL;
1252 1371
1253 mutex_lock(&subdomain->lock); 1372 genpd_lock(subdomain);
1254 mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); 1373 genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
1255 1374
1256 if (!list_empty(&subdomain->master_links) || subdomain->device_count) { 1375 if (!list_empty(&subdomain->master_links) || subdomain->device_count) {
1257 pr_warn("%s: unable to remove subdomain %s\n", genpd->name, 1376 pr_warn("%s: unable to remove subdomain %s\n", genpd->name,
@@ -1275,13 +1394,39 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
1275 } 1394 }
1276 1395
1277out: 1396out:
1278 mutex_unlock(&genpd->lock); 1397 genpd_unlock(genpd);
1279 mutex_unlock(&subdomain->lock); 1398 genpd_unlock(subdomain);
1280 1399
1281 return ret; 1400 return ret;
1282} 1401}
1283EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain); 1402EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain);
1284 1403
1404static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
1405{
1406 struct genpd_power_state *state;
1407
1408 state = kzalloc(sizeof(*state), GFP_KERNEL);
1409 if (!state)
1410 return -ENOMEM;
1411
1412 genpd->states = state;
1413 genpd->state_count = 1;
1414 genpd->free = state;
1415
1416 return 0;
1417}
1418
1419static void genpd_lock_init(struct generic_pm_domain *genpd)
1420{
1421 if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
1422 spin_lock_init(&genpd->slock);
1423 genpd->lock_ops = &genpd_spin_ops;
1424 } else {
1425 mutex_init(&genpd->mlock);
1426 genpd->lock_ops = &genpd_mtx_ops;
1427 }
1428}
1429
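A minimal, illustrative sketch (the names are hypothetical) of how a PM-domain provider would opt into the spinlock-based locking introduced by genpd_lock_init(); genpd_lock()/genpd_unlock() then dispatch through the per-domain lock_ops:

	static struct generic_pm_domain foo_pd = {
		.name  = "foo",
		.flags = GENPD_FLAG_IRQ_SAFE,	/* genpd_lock_init() selects genpd_spin_ops */
	};

	static int foo_pd_setup(void)
	{
		/* Without GENPD_FLAG_IRQ_SAFE the domain keeps the mutex-based
		 * genpd_mtx_ops and behaves exactly as before this change. */
		return pm_genpd_init(&foo_pd, NULL, false);
	}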
1285/** 1430/**
1286 * pm_genpd_init - Initialize a generic I/O PM domain object. 1431 * pm_genpd_init - Initialize a generic I/O PM domain object.
1287 * @genpd: PM domain object to initialize. 1432 * @genpd: PM domain object to initialize.
@@ -1293,13 +1438,15 @@ EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain);
1293int pm_genpd_init(struct generic_pm_domain *genpd, 1438int pm_genpd_init(struct generic_pm_domain *genpd,
1294 struct dev_power_governor *gov, bool is_off) 1439 struct dev_power_governor *gov, bool is_off)
1295{ 1440{
1441 int ret;
1442
1296 if (IS_ERR_OR_NULL(genpd)) 1443 if (IS_ERR_OR_NULL(genpd))
1297 return -EINVAL; 1444 return -EINVAL;
1298 1445
1299 INIT_LIST_HEAD(&genpd->master_links); 1446 INIT_LIST_HEAD(&genpd->master_links);
1300 INIT_LIST_HEAD(&genpd->slave_links); 1447 INIT_LIST_HEAD(&genpd->slave_links);
1301 INIT_LIST_HEAD(&genpd->dev_list); 1448 INIT_LIST_HEAD(&genpd->dev_list);
1302 mutex_init(&genpd->lock); 1449 genpd_lock_init(genpd);
1303 genpd->gov = gov; 1450 genpd->gov = gov;
1304 INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); 1451 INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
1305 atomic_set(&genpd->sd_count, 0); 1452 atomic_set(&genpd->sd_count, 0);
@@ -1325,19 +1472,12 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
1325 genpd->dev_ops.start = pm_clk_resume; 1472 genpd->dev_ops.start = pm_clk_resume;
1326 } 1473 }
1327 1474
1328 if (genpd->state_idx >= GENPD_MAX_NUM_STATES) {
1329 pr_warn("Initial state index out of bounds.\n");
1330 genpd->state_idx = GENPD_MAX_NUM_STATES - 1;
1331 }
1332
1333 if (genpd->state_count > GENPD_MAX_NUM_STATES) {
1334 pr_warn("Limiting states to %d\n", GENPD_MAX_NUM_STATES);
1335 genpd->state_count = GENPD_MAX_NUM_STATES;
1336 }
1337
1338 /* Use only one "off" state if there were no states declared */ 1475 /* Use only one "off" state if there were no states declared */
1339 if (genpd->state_count == 0) 1476 if (genpd->state_count == 0) {
1340 genpd->state_count = 1; 1477 ret = genpd_set_default_power_state(genpd);
1478 if (ret)
1479 return ret;
1480 }
1341 1481
1342 mutex_lock(&gpd_list_lock); 1482 mutex_lock(&gpd_list_lock);
1343 list_add(&genpd->gpd_list_node, &gpd_list); 1483 list_add(&genpd->gpd_list_node, &gpd_list);
@@ -1354,16 +1494,16 @@ static int genpd_remove(struct generic_pm_domain *genpd)
1354 if (IS_ERR_OR_NULL(genpd)) 1494 if (IS_ERR_OR_NULL(genpd))
1355 return -EINVAL; 1495 return -EINVAL;
1356 1496
1357 mutex_lock(&genpd->lock); 1497 genpd_lock(genpd);
1358 1498
1359 if (genpd->has_provider) { 1499 if (genpd->has_provider) {
1360 mutex_unlock(&genpd->lock); 1500 genpd_unlock(genpd);
1361 pr_err("Provider present, unable to remove %s\n", genpd->name); 1501 pr_err("Provider present, unable to remove %s\n", genpd->name);
1362 return -EBUSY; 1502 return -EBUSY;
1363 } 1503 }
1364 1504
1365 if (!list_empty(&genpd->master_links) || genpd->device_count) { 1505 if (!list_empty(&genpd->master_links) || genpd->device_count) {
1366 mutex_unlock(&genpd->lock); 1506 genpd_unlock(genpd);
1367 pr_err("%s: unable to remove %s\n", __func__, genpd->name); 1507 pr_err("%s: unable to remove %s\n", __func__, genpd->name);
1368 return -EBUSY; 1508 return -EBUSY;
1369 } 1509 }
@@ -1375,8 +1515,9 @@ static int genpd_remove(struct generic_pm_domain *genpd)
1375 } 1515 }
1376 1516
1377 list_del(&genpd->gpd_list_node); 1517 list_del(&genpd->gpd_list_node);
1378 mutex_unlock(&genpd->lock); 1518 genpd_unlock(genpd);
1379 cancel_work_sync(&genpd->power_off_work); 1519 cancel_work_sync(&genpd->power_off_work);
1520 kfree(genpd->free);
1380 pr_debug("%s: removed %s\n", __func__, genpd->name); 1521 pr_debug("%s: removed %s\n", __func__, genpd->name);
1381 1522
1382 return 0; 1523 return 0;
@@ -1890,21 +2031,117 @@ int genpd_dev_pm_attach(struct device *dev)
1890 mutex_unlock(&gpd_list_lock); 2031 mutex_unlock(&gpd_list_lock);
1891 2032
1892 if (ret < 0) { 2033 if (ret < 0) {
1893 dev_err(dev, "failed to add to PM domain %s: %d", 2034 if (ret != -EPROBE_DEFER)
1894 pd->name, ret); 2035 dev_err(dev, "failed to add to PM domain %s: %d",
2036 pd->name, ret);
1895 goto out; 2037 goto out;
1896 } 2038 }
1897 2039
1898 dev->pm_domain->detach = genpd_dev_pm_detach; 2040 dev->pm_domain->detach = genpd_dev_pm_detach;
1899 dev->pm_domain->sync = genpd_dev_pm_sync; 2041 dev->pm_domain->sync = genpd_dev_pm_sync;
1900 2042
1901 mutex_lock(&pd->lock); 2043 genpd_lock(pd);
1902 ret = genpd_poweron(pd, 0); 2044 ret = genpd_poweron(pd, 0);
1903 mutex_unlock(&pd->lock); 2045 genpd_unlock(pd);
1904out: 2046out:
1905 return ret ? -EPROBE_DEFER : 0; 2047 return ret ? -EPROBE_DEFER : 0;
1906} 2048}
1907EXPORT_SYMBOL_GPL(genpd_dev_pm_attach); 2049EXPORT_SYMBOL_GPL(genpd_dev_pm_attach);
2050
2051static const struct of_device_id idle_state_match[] = {
2052 { .compatible = "domain-idle-state", },
2053 { }
2054};
2055
2056static int genpd_parse_state(struct genpd_power_state *genpd_state,
2057 struct device_node *state_node)
2058{
2059 int err;
2060 u32 residency;
2061 u32 entry_latency, exit_latency;
2062 const struct of_device_id *match_id;
2063
2064 match_id = of_match_node(idle_state_match, state_node);
2065 if (!match_id)
2066 return -EINVAL;
2067
2068 err = of_property_read_u32(state_node, "entry-latency-us",
2069 &entry_latency);
2070 if (err) {
2071 pr_debug(" * %s missing entry-latency-us property\n",
2072 state_node->full_name);
2073 return -EINVAL;
2074 }
2075
2076 err = of_property_read_u32(state_node, "exit-latency-us",
2077 &exit_latency);
2078 if (err) {
2079 pr_debug(" * %s missing exit-latency-us property\n",
2080 state_node->full_name);
2081 return -EINVAL;
2082 }
2083
2084 err = of_property_read_u32(state_node, "min-residency-us", &residency);
2085 if (!err)
2086 genpd_state->residency_ns = 1000 * residency;
2087
2088 genpd_state->power_on_latency_ns = 1000 * exit_latency;
2089 genpd_state->power_off_latency_ns = 1000 * entry_latency;
2090 genpd_state->fwnode = &state_node->fwnode;
2091
2092 return 0;
2093}
2094
2095/**
2096 * of_genpd_parse_idle_states: Return array of idle states for the genpd.
2097 *
2098 * @dn: The genpd device node
2099 * @states: The pointer to which the state array will be saved.
2100 * @n: The count of elements in the array returned from this function.
2101 *
 2102 * Returns the device states parsed from the OF node. The memory for the states
 2103 * is allocated by this function, and it is the caller's responsibility to free
 2104 * it after use.
2105 */
2106int of_genpd_parse_idle_states(struct device_node *dn,
2107 struct genpd_power_state **states, int *n)
2108{
2109 struct genpd_power_state *st;
2110 struct device_node *np;
2111 int i = 0;
2112 int err, ret;
2113 int count;
2114 struct of_phandle_iterator it;
2115
2116 count = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
2117 if (count <= 0)
2118 return -EINVAL;
2119
2120 st = kcalloc(count, sizeof(*st), GFP_KERNEL);
2121 if (!st)
2122 return -ENOMEM;
2123
 2124 /* Loop over the phandles until all the requested entries have been found */
2125 of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) {
2126 np = it.node;
2127 ret = genpd_parse_state(&st[i++], np);
2128 if (ret) {
 2129 			pr_err("Parsing idle state node %s failed with err %d\n",
 2130 			       np->full_name,
 2131 			       ret);
2132 of_node_put(np);
2133 kfree(st);
2134 return ret;
2135 }
2136 }
2137
2138 *n = count;
2139 *states = st;
2140
2141 return 0;
2142}
2143EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states);
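An illustrative caller of the new of_genpd_parse_idle_states() helper, assuming a hypothetical provider probe path; the helper kmalloc's the array, and the caller owns it and is responsible for freeing it:

	static int foo_pd_add_states(struct device_node *np,
				     struct generic_pm_domain *genpd)
	{
		struct genpd_power_state *states;
		int ret, count;

		ret = of_genpd_parse_idle_states(np, &states, &count);
		if (ret)
			return ret;

		/* Hand the parsed states to the domain before initializing it. */
		genpd->states = states;
		genpd->state_count = count;

		return pm_genpd_init(genpd, NULL, false);
	}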
2144
1908#endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ 2145#endif /* CONFIG_PM_GENERIC_DOMAINS_OF */
1909 2146
1910 2147
@@ -1958,7 +2195,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
1958 char state[16]; 2195 char state[16];
1959 int ret; 2196 int ret;
1960 2197
1961 ret = mutex_lock_interruptible(&genpd->lock); 2198 ret = genpd_lock_interruptible(genpd);
1962 if (ret) 2199 if (ret)
1963 return -ERESTARTSYS; 2200 return -ERESTARTSYS;
1964 2201
@@ -1984,7 +2221,9 @@ static int pm_genpd_summary_one(struct seq_file *s,
1984 } 2221 }
1985 2222
1986 list_for_each_entry(pm_data, &genpd->dev_list, list_node) { 2223 list_for_each_entry(pm_data, &genpd->dev_list, list_node) {
1987 kobj_path = kobject_get_path(&pm_data->dev->kobj, GFP_KERNEL); 2224 kobj_path = kobject_get_path(&pm_data->dev->kobj,
2225 genpd_is_irq_safe(genpd) ?
2226 GFP_ATOMIC : GFP_KERNEL);
1988 if (kobj_path == NULL) 2227 if (kobj_path == NULL)
1989 continue; 2228 continue;
1990 2229
@@ -1995,7 +2234,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
1995 2234
1996 seq_puts(s, "\n"); 2235 seq_puts(s, "\n");
1997exit: 2236exit:
1998 mutex_unlock(&genpd->lock); 2237 genpd_unlock(genpd);
1999 2238
2000 return 0; 2239 return 0;
2001} 2240}
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 2932a5bd892f..eb474c882ebe 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1460,10 +1460,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
1460 dpm_watchdog_clear(&wd); 1460 dpm_watchdog_clear(&wd);
1461 1461
1462 Complete: 1462 Complete:
1463 complete_all(&dev->power.completion);
1464 if (error) 1463 if (error)
1465 async_error = error; 1464 async_error = error;
1466 1465
1466 complete_all(&dev->power.completion);
1467 TRACE_SUSPEND(error); 1467 TRACE_SUSPEND(error);
1468 return error; 1468 return error;
1469} 1469}
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 4c7c6da7a989..35ff06283738 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -93,6 +93,8 @@ struct opp_table *_find_opp_table(struct device *dev)
93 * Return: voltage in micro volt corresponding to the opp, else 93 * Return: voltage in micro volt corresponding to the opp, else
94 * return 0 94 * return 0
95 * 95 *
 96 * This is useful only for devices with a single power supply.
97 *
96 * Locking: This function must be called under rcu_read_lock(). opp is a rcu 98 * Locking: This function must be called under rcu_read_lock(). opp is a rcu
97 * protected pointer. This means that opp which could have been fetched by 99 * protected pointer. This means that opp which could have been fetched by
98 * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are 100 * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
@@ -112,7 +114,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
112 if (IS_ERR_OR_NULL(tmp_opp)) 114 if (IS_ERR_OR_NULL(tmp_opp))
113 pr_err("%s: Invalid parameters\n", __func__); 115 pr_err("%s: Invalid parameters\n", __func__);
114 else 116 else
115 v = tmp_opp->u_volt; 117 v = tmp_opp->supplies[0].u_volt;
116 118
117 return v; 119 return v;
118} 120}
@@ -210,6 +212,24 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
210} 212}
211EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); 213EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
212 214
215static int _get_regulator_count(struct device *dev)
216{
217 struct opp_table *opp_table;
218 int count;
219
220 rcu_read_lock();
221
222 opp_table = _find_opp_table(dev);
223 if (!IS_ERR(opp_table))
224 count = opp_table->regulator_count;
225 else
226 count = 0;
227
228 rcu_read_unlock();
229
230 return count;
231}
232
213/** 233/**
214 * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds 234 * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
215 * @dev: device for which we do this operation 235 * @dev: device for which we do this operation
@@ -222,34 +242,51 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
222{ 242{
223 struct opp_table *opp_table; 243 struct opp_table *opp_table;
224 struct dev_pm_opp *opp; 244 struct dev_pm_opp *opp;
225 struct regulator *reg; 245 struct regulator *reg, **regulators;
226 unsigned long latency_ns = 0; 246 unsigned long latency_ns = 0;
227 unsigned long min_uV = ~0, max_uV = 0; 247 int ret, i, count;
228 int ret; 248 struct {
249 unsigned long min;
250 unsigned long max;
251 } *uV;
252
253 count = _get_regulator_count(dev);
254
255 /* Regulator may not be required for the device */
256 if (!count)
257 return 0;
258
259 regulators = kmalloc_array(count, sizeof(*regulators), GFP_KERNEL);
260 if (!regulators)
261 return 0;
262
263 uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
264 if (!uV)
265 goto free_regulators;
229 266
230 rcu_read_lock(); 267 rcu_read_lock();
231 268
232 opp_table = _find_opp_table(dev); 269 opp_table = _find_opp_table(dev);
233 if (IS_ERR(opp_table)) { 270 if (IS_ERR(opp_table)) {
234 rcu_read_unlock(); 271 rcu_read_unlock();
235 return 0; 272 goto free_uV;
236 } 273 }
237 274
238 reg = opp_table->regulator; 275 memcpy(regulators, opp_table->regulators, count * sizeof(*regulators));
239 if (IS_ERR(reg)) {
240 /* Regulator may not be required for device */
241 rcu_read_unlock();
242 return 0;
243 }
244 276
245 list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { 277 for (i = 0; i < count; i++) {
246 if (!opp->available) 278 uV[i].min = ~0;
247 continue; 279 uV[i].max = 0;
280
281 list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
282 if (!opp->available)
283 continue;
248 284
249 if (opp->u_volt_min < min_uV) 285 if (opp->supplies[i].u_volt_min < uV[i].min)
250 min_uV = opp->u_volt_min; 286 uV[i].min = opp->supplies[i].u_volt_min;
251 if (opp->u_volt_max > max_uV) 287 if (opp->supplies[i].u_volt_max > uV[i].max)
252 max_uV = opp->u_volt_max; 288 uV[i].max = opp->supplies[i].u_volt_max;
289 }
253 } 290 }
254 291
255 rcu_read_unlock(); 292 rcu_read_unlock();
@@ -258,9 +295,16 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
258 * The caller needs to ensure that opp_table (and hence the regulator) 295 * The caller needs to ensure that opp_table (and hence the regulator)
259 * isn't freed, while we are executing this routine. 296 * isn't freed, while we are executing this routine.
260 */ 297 */
261 ret = regulator_set_voltage_time(reg, min_uV, max_uV); 298 for (i = 0; reg = regulators[i], i < count; i++) {
262 if (ret > 0) 299 ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
263 latency_ns = ret * 1000; 300 if (ret > 0)
301 latency_ns += ret * 1000;
302 }
303
304free_uV:
305 kfree(uV);
306free_regulators:
307 kfree(regulators);
264 308
265 return latency_ns; 309 return latency_ns;
266} 310}
@@ -542,8 +586,7 @@ unlock:
542} 586}
543 587
544static int _set_opp_voltage(struct device *dev, struct regulator *reg, 588static int _set_opp_voltage(struct device *dev, struct regulator *reg,
545 unsigned long u_volt, unsigned long u_volt_min, 589 struct dev_pm_opp_supply *supply)
546 unsigned long u_volt_max)
547{ 590{
548 int ret; 591 int ret;
549 592
@@ -554,14 +597,78 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
554 return 0; 597 return 0;
555 } 598 }
556 599
557 dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min, 600 dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
558 u_volt, u_volt_max); 601 supply->u_volt_min, supply->u_volt, supply->u_volt_max);
559 602
560 ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt, 603 ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
561 u_volt_max); 604 supply->u_volt, supply->u_volt_max);
562 if (ret) 605 if (ret)
563 dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n", 606 dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
564 __func__, u_volt_min, u_volt, u_volt_max, ret); 607 __func__, supply->u_volt_min, supply->u_volt,
608 supply->u_volt_max, ret);
609
610 return ret;
611}
612
613static inline int
614_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
615 unsigned long old_freq, unsigned long freq)
616{
617 int ret;
618
619 ret = clk_set_rate(clk, freq);
620 if (ret) {
621 dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
622 ret);
623 }
624
625 return ret;
626}
627
628static int _generic_set_opp(struct dev_pm_set_opp_data *data)
629{
630 struct dev_pm_opp_supply *old_supply = data->old_opp.supplies;
631 struct dev_pm_opp_supply *new_supply = data->new_opp.supplies;
632 unsigned long old_freq = data->old_opp.rate, freq = data->new_opp.rate;
633 struct regulator *reg = data->regulators[0];
 634 	struct device *dev = data->dev;
635 int ret;
636
637 /* This function only supports single regulator per device */
638 if (WARN_ON(data->regulator_count > 1)) {
639 dev_err(dev, "multiple regulators are not supported\n");
640 return -EINVAL;
641 }
642
643 /* Scaling up? Scale voltage before frequency */
644 if (freq > old_freq) {
645 ret = _set_opp_voltage(dev, reg, new_supply);
646 if (ret)
647 goto restore_voltage;
648 }
649
650 /* Change frequency */
651 ret = _generic_set_opp_clk_only(dev, data->clk, old_freq, freq);
652 if (ret)
653 goto restore_voltage;
654
655 /* Scaling down? Scale voltage after frequency */
656 if (freq < old_freq) {
657 ret = _set_opp_voltage(dev, reg, new_supply);
658 if (ret)
659 goto restore_freq;
660 }
661
662 return 0;
663
664restore_freq:
665 if (_generic_set_opp_clk_only(dev, data->clk, freq, old_freq))
666 dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
667 __func__, old_freq);
668restore_voltage:
669 /* This shouldn't harm even if the voltages weren't updated earlier */
670 if (old_supply->u_volt)
671 _set_opp_voltage(dev, reg, old_supply);
565 672
566 return ret; 673 return ret;
567} 674}
@@ -579,12 +686,13 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
579int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) 686int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
580{ 687{
581 struct opp_table *opp_table; 688 struct opp_table *opp_table;
689 unsigned long freq, old_freq;
690 int (*set_opp)(struct dev_pm_set_opp_data *data);
582 struct dev_pm_opp *old_opp, *opp; 691 struct dev_pm_opp *old_opp, *opp;
583 struct regulator *reg; 692 struct regulator **regulators;
693 struct dev_pm_set_opp_data *data;
584 struct clk *clk; 694 struct clk *clk;
585 unsigned long freq, old_freq; 695 int ret, size;
586 unsigned long u_volt, u_volt_min, u_volt_max;
587 int ret;
588 696
589 if (unlikely(!target_freq)) { 697 if (unlikely(!target_freq)) {
590 dev_err(dev, "%s: Invalid target frequency %lu\n", __func__, 698 dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
@@ -633,55 +741,41 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
633 return ret; 741 return ret;
634 } 742 }
635 743
636 u_volt = opp->u_volt; 744 dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
637 u_volt_min = opp->u_volt_min; 745 old_freq, freq);
638 u_volt_max = opp->u_volt_max;
639 746
640 reg = opp_table->regulator; 747 regulators = opp_table->regulators;
641 748
642 rcu_read_unlock(); 749 /* Only frequency scaling */
643 750 if (!regulators) {
644 /* Scaling up? Scale voltage before frequency */ 751 rcu_read_unlock();
645 if (freq > old_freq) { 752 return _generic_set_opp_clk_only(dev, clk, old_freq, freq);
646 ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
647 u_volt_max);
648 if (ret)
649 goto restore_voltage;
650 }
651
652 /* Change frequency */
653
654 dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
655 __func__, old_freq, freq);
656
657 ret = clk_set_rate(clk, freq);
658 if (ret) {
659 dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
660 ret);
661 goto restore_voltage;
662 } 753 }
663 754
664 /* Scaling down? Scale voltage after frequency */ 755 if (opp_table->set_opp)
665 if (freq < old_freq) { 756 set_opp = opp_table->set_opp;
666 ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, 757 else
667 u_volt_max); 758 set_opp = _generic_set_opp;
668 if (ret) 759
669 goto restore_freq; 760 data = opp_table->set_opp_data;
670 } 761 data->regulators = regulators;
762 data->regulator_count = opp_table->regulator_count;
763 data->clk = clk;
764 data->dev = dev;
765
766 data->old_opp.rate = old_freq;
767 size = sizeof(*opp->supplies) * opp_table->regulator_count;
768 if (IS_ERR(old_opp))
769 memset(data->old_opp.supplies, 0, size);
770 else
771 memcpy(data->old_opp.supplies, old_opp->supplies, size);
671 772
672 return 0; 773 data->new_opp.rate = freq;
774 memcpy(data->new_opp.supplies, opp->supplies, size);
673 775
674restore_freq: 776 rcu_read_unlock();
675 if (clk_set_rate(clk, old_freq))
676 dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
677 __func__, old_freq);
678restore_voltage:
679 /* This shouldn't harm even if the voltages weren't updated earlier */
680 if (!IS_ERR(old_opp))
681 _set_opp_voltage(dev, reg, old_opp->u_volt,
682 old_opp->u_volt_min, old_opp->u_volt_max);
683 777
684 return ret; 778 return set_opp(data);
685} 779}
686EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); 780EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
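For OPP consumers the call site is unchanged; only the internals differ. A hedged sketch of a typical (hypothetical) user:

	static int foo_set_target(struct device *dev, unsigned long target_hz)
	{
		/* The core now chooses internally between the clock-only path,
		 * the generic single-regulator path and a custom set_opp helper. */
		return dev_pm_opp_set_rate(dev, target_hz);
	}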
687 781
@@ -764,9 +858,6 @@ static struct opp_table *_add_opp_table(struct device *dev)
764 858
765 _of_init_opp_table(opp_table, dev); 859 _of_init_opp_table(opp_table, dev);
766 860
767 /* Set regulator to a non-NULL error value */
768 opp_table->regulator = ERR_PTR(-ENXIO);
769
770 /* Find clk for the device */ 861 /* Find clk for the device */
771 opp_table->clk = clk_get(dev, NULL); 862 opp_table->clk = clk_get(dev, NULL);
772 if (IS_ERR(opp_table->clk)) { 863 if (IS_ERR(opp_table->clk)) {
@@ -815,7 +906,10 @@ static void _remove_opp_table(struct opp_table *opp_table)
815 if (opp_table->prop_name) 906 if (opp_table->prop_name)
816 return; 907 return;
817 908
818 if (!IS_ERR(opp_table->regulator)) 909 if (opp_table->regulators)
910 return;
911
912 if (opp_table->set_opp)
819 return; 913 return;
820 914
821 /* Release clk */ 915 /* Release clk */
@@ -924,34 +1018,50 @@ struct dev_pm_opp *_allocate_opp(struct device *dev,
924 struct opp_table **opp_table) 1018 struct opp_table **opp_table)
925{ 1019{
926 struct dev_pm_opp *opp; 1020 struct dev_pm_opp *opp;
1021 int count, supply_size;
1022 struct opp_table *table;
927 1023
928 /* allocate new OPP node */ 1024 table = _add_opp_table(dev);
929 opp = kzalloc(sizeof(*opp), GFP_KERNEL); 1025 if (!table)
930 if (!opp)
931 return NULL; 1026 return NULL;
932 1027
933 INIT_LIST_HEAD(&opp->node); 1028 /* Allocate space for at least one supply */
1029 count = table->regulator_count ? table->regulator_count : 1;
1030 supply_size = sizeof(*opp->supplies) * count;
934 1031
935 *opp_table = _add_opp_table(dev); 1032 /* allocate new OPP node and supplies structures */
936 if (!*opp_table) { 1033 opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
937 kfree(opp); 1034 if (!opp) {
1035 kfree(table);
938 return NULL; 1036 return NULL;
939 } 1037 }
940 1038
1039 /* Put the supplies at the end of the OPP structure as an empty array */
1040 opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
1041 INIT_LIST_HEAD(&opp->node);
1042
1043 *opp_table = table;
1044
941 return opp; 1045 return opp;
942} 1046}
943 1047
944static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, 1048static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
945 struct opp_table *opp_table) 1049 struct opp_table *opp_table)
946{ 1050{
947 struct regulator *reg = opp_table->regulator; 1051 struct regulator *reg;
948 1052 int i;
949 if (!IS_ERR(reg) && 1053
950 !regulator_is_supported_voltage(reg, opp->u_volt_min, 1054 for (i = 0; i < opp_table->regulator_count; i++) {
951 opp->u_volt_max)) { 1055 reg = opp_table->regulators[i];
952 pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", 1056
953 __func__, opp->u_volt_min, opp->u_volt_max); 1057 if (!regulator_is_supported_voltage(reg,
954 return false; 1058 opp->supplies[i].u_volt_min,
1059 opp->supplies[i].u_volt_max)) {
1060 pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
1061 __func__, opp->supplies[i].u_volt_min,
1062 opp->supplies[i].u_volt_max);
1063 return false;
1064 }
955 } 1065 }
956 1066
957 return true; 1067 return true;
@@ -983,11 +1093,13 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
983 1093
984 /* Duplicate OPPs */ 1094 /* Duplicate OPPs */
985 dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n", 1095 dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
986 __func__, opp->rate, opp->u_volt, opp->available, 1096 __func__, opp->rate, opp->supplies[0].u_volt,
987 new_opp->rate, new_opp->u_volt, new_opp->available); 1097 opp->available, new_opp->rate,
1098 new_opp->supplies[0].u_volt, new_opp->available);
988 1099
989 return opp->available && new_opp->u_volt == opp->u_volt ? 1100 /* Should we compare voltages for all regulators here ? */
990 0 : -EEXIST; 1101 return opp->available &&
1102 new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? 0 : -EEXIST;
991 } 1103 }
992 1104
993 new_opp->opp_table = opp_table; 1105 new_opp->opp_table = opp_table;
@@ -1054,9 +1166,9 @@ int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
1054 /* populate the opp table */ 1166 /* populate the opp table */
1055 new_opp->rate = freq; 1167 new_opp->rate = freq;
1056 tol = u_volt * opp_table->voltage_tolerance_v1 / 100; 1168 tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
1057 new_opp->u_volt = u_volt; 1169 new_opp->supplies[0].u_volt = u_volt;
1058 new_opp->u_volt_min = u_volt - tol; 1170 new_opp->supplies[0].u_volt_min = u_volt - tol;
1059 new_opp->u_volt_max = u_volt + tol; 1171 new_opp->supplies[0].u_volt_max = u_volt + tol;
1060 new_opp->available = true; 1172 new_opp->available = true;
1061 new_opp->dynamic = dynamic; 1173 new_opp->dynamic = dynamic;
1062 1174
@@ -1300,13 +1412,47 @@ unlock:
1300} 1412}
1301EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); 1413EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
1302 1414
1415static int _allocate_set_opp_data(struct opp_table *opp_table)
1416{
1417 struct dev_pm_set_opp_data *data;
1418 int len, count = opp_table->regulator_count;
1419
1420 if (WARN_ON(!count))
1421 return -EINVAL;
1422
1423 /* space for set_opp_data */
1424 len = sizeof(*data);
1425
1426 /* space for old_opp.supplies and new_opp.supplies */
1427 len += 2 * sizeof(struct dev_pm_opp_supply) * count;
1428
1429 data = kzalloc(len, GFP_KERNEL);
1430 if (!data)
1431 return -ENOMEM;
1432
1433 data->old_opp.supplies = (void *)(data + 1);
1434 data->new_opp.supplies = data->old_opp.supplies + count;
1435
1436 opp_table->set_opp_data = data;
1437
1438 return 0;
1439}
1440
1441static void _free_set_opp_data(struct opp_table *opp_table)
1442{
1443 kfree(opp_table->set_opp_data);
1444 opp_table->set_opp_data = NULL;
1445}
1446
1303/** 1447/**
1304 * dev_pm_opp_set_regulator() - Set regulator name for the device 1448 * dev_pm_opp_set_regulators() - Set regulator names for the device
1305 * @dev: Device for which regulator name is being set. 1449 * @dev: Device for which regulator name is being set.
1306 * @name: Name of the regulator. 1450 * @names: Array of pointers to the names of the regulator.
1451 * @count: Number of regulators.
1307 * 1452 *
1308 * In order to support OPP switching, OPP layer needs to know the name of the 1453 * In order to support OPP switching, OPP layer needs to know the name of the
1309 * device's regulator, as the core would be required to switch voltages as well. 1454 * device's regulators, as the core would be required to switch voltages as
1455 * well.
1310 * 1456 *
1311 * This must be called before any OPPs are initialized for the device. 1457 * This must be called before any OPPs are initialized for the device.
1312 * 1458 *
@@ -1316,11 +1462,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
1316 * that this function is *NOT* called under RCU protection or in contexts where 1462 * that this function is *NOT* called under RCU protection or in contexts where
1317 * mutex cannot be locked. 1463 * mutex cannot be locked.
1318 */ 1464 */
1319int dev_pm_opp_set_regulator(struct device *dev, const char *name) 1465struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
1466 const char * const names[],
1467 unsigned int count)
1320{ 1468{
1321 struct opp_table *opp_table; 1469 struct opp_table *opp_table;
1322 struct regulator *reg; 1470 struct regulator *reg;
1323 int ret; 1471 int ret, i;
1324 1472
1325 mutex_lock(&opp_table_lock); 1473 mutex_lock(&opp_table_lock);
1326 1474
@@ -1336,22 +1484,146 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name)
1336 goto err; 1484 goto err;
1337 } 1485 }
1338 1486
1339 /* Already have a regulator set */ 1487 /* Already have regulators set */
1340 if (WARN_ON(!IS_ERR(opp_table->regulator))) { 1488 if (opp_table->regulators) {
1341 ret = -EBUSY; 1489 ret = -EBUSY;
1342 goto err; 1490 goto err;
1343 } 1491 }
1344 /* Allocate the regulator */ 1492
1345 reg = regulator_get_optional(dev, name); 1493 opp_table->regulators = kmalloc_array(count,
1346 if (IS_ERR(reg)) { 1494 sizeof(*opp_table->regulators),
1347 ret = PTR_ERR(reg); 1495 GFP_KERNEL);
1348 if (ret != -EPROBE_DEFER) 1496 if (!opp_table->regulators) {
1349 dev_err(dev, "%s: no regulator (%s) found: %d\n", 1497 ret = -ENOMEM;
1350 __func__, name, ret); 1498 goto err;
1499 }
1500
1501 for (i = 0; i < count; i++) {
1502 reg = regulator_get_optional(dev, names[i]);
1503 if (IS_ERR(reg)) {
1504 ret = PTR_ERR(reg);
1505 if (ret != -EPROBE_DEFER)
1506 dev_err(dev, "%s: no regulator (%s) found: %d\n",
1507 __func__, names[i], ret);
1508 goto free_regulators;
1509 }
1510
1511 opp_table->regulators[i] = reg;
1512 }
1513
1514 opp_table->regulator_count = count;
1515
1516 /* Allocate block only once to pass to set_opp() routines */
1517 ret = _allocate_set_opp_data(opp_table);
1518 if (ret)
1519 goto free_regulators;
1520
1521 mutex_unlock(&opp_table_lock);
1522 return opp_table;
1523
1524free_regulators:
1525 while (i != 0)
1526 regulator_put(opp_table->regulators[--i]);
1527
1528 kfree(opp_table->regulators);
1529 opp_table->regulators = NULL;
1530 opp_table->regulator_count = 0;
1531err:
1532 _remove_opp_table(opp_table);
1533unlock:
1534 mutex_unlock(&opp_table_lock);
1535
1536 return ERR_PTR(ret);
1537}
1538EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
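A hedged usage sketch of the new multi-regulator API; the supply names and helper names are illustrative only, and the call must happen before any OPPs are added for the device (declarations come from linux/pm_opp.h):

	static const char * const foo_supplies[] = { "vdd-core", "vdd-mem" };

	static int foo_opp_init(struct device *dev, struct opp_table **table)
	{
		/* Must be called before any OPPs are added for the device. */
		*table = dev_pm_opp_set_regulators(dev, foo_supplies,
						   ARRAY_SIZE(foo_supplies));
		if (IS_ERR(*table))
			return PTR_ERR(*table);

		return 0;
	}

	static void foo_opp_exit(struct opp_table *table)
	{
		/* Drops the regulator references and frees the set_opp_data block. */
		dev_pm_opp_put_regulators(table);
	}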
1539
1540/**
1541 * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
1542 * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
1543 *
1544 * Locking: The internal opp_table and opp structures are RCU protected.
1545 * Hence this function internally uses RCU updater strategy with mutex locks
1546 * to keep the integrity of the internal data structures. Callers should ensure
1547 * that this function is *NOT* called under RCU protection or in contexts where
1548 * mutex cannot be locked.
1549 */
1550void dev_pm_opp_put_regulators(struct opp_table *opp_table)
1551{
1552 int i;
1553
1554 mutex_lock(&opp_table_lock);
1555
1556 if (!opp_table->regulators) {
1557 pr_err("%s: Doesn't have regulators set\n", __func__);
1558 goto unlock;
1559 }
1560
1561 /* Make sure there are no concurrent readers while updating opp_table */
1562 WARN_ON(!list_empty(&opp_table->opp_list));
1563
1564 for (i = opp_table->regulator_count - 1; i >= 0; i--)
1565 regulator_put(opp_table->regulators[i]);
1566
1567 _free_set_opp_data(opp_table);
1568
1569 kfree(opp_table->regulators);
1570 opp_table->regulators = NULL;
1571 opp_table->regulator_count = 0;
1572
1573 /* Try freeing opp_table if this was the last blocking resource */
1574 _remove_opp_table(opp_table);
1575
1576unlock:
1577 mutex_unlock(&opp_table_lock);
1578}
1579EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
1580
1581/**
1582 * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
1583 * @dev: Device for which the helper is getting registered.
1584 * @set_opp: Custom set OPP helper.
1585 *
 1586 * This is useful for complex platforms (such as platforms with multiple
 1587 * regulators per device) that cannot use the generic OPP set-rate helper.
1588 *
1589 * This must be called before any OPPs are initialized for the device.
1590 *
1591 * Locking: The internal opp_table and opp structures are RCU protected.
1592 * Hence this function internally uses RCU updater strategy with mutex locks
1593 * to keep the integrity of the internal data structures. Callers should ensure
1594 * that this function is *NOT* called under RCU protection or in contexts where
1595 * mutex cannot be locked.
1596 */
1597int dev_pm_opp_register_set_opp_helper(struct device *dev,
1598 int (*set_opp)(struct dev_pm_set_opp_data *data))
1599{
1600 struct opp_table *opp_table;
1601 int ret;
1602
1603 if (!set_opp)
1604 return -EINVAL;
1605
1606 mutex_lock(&opp_table_lock);
1607
1608 opp_table = _add_opp_table(dev);
1609 if (!opp_table) {
1610 ret = -ENOMEM;
1611 goto unlock;
1612 }
1613
1614 /* This should be called before OPPs are initialized */
1615 if (WARN_ON(!list_empty(&opp_table->opp_list))) {
1616 ret = -EBUSY;
1351 goto err; 1617 goto err;
1352 } 1618 }
1353 1619
1354 opp_table->regulator = reg; 1620 /* Already have custom set_opp helper */
1621 if (WARN_ON(opp_table->set_opp)) {
1622 ret = -EBUSY;
1623 goto err;
1624 }
1625
1626 opp_table->set_opp = set_opp;
1355 1627
1356 mutex_unlock(&opp_table_lock); 1628 mutex_unlock(&opp_table_lock);
1357 return 0; 1629 return 0;
@@ -1363,11 +1635,12 @@ unlock:
1363 1635
1364 return ret; 1636 return ret;
1365} 1637}
1366EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); 1638EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
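A hedged sketch of a platform-specific set_opp callback of the kind this helper is meant for; the field accesses mirror _generic_set_opp() above, but the ordering policy and names are purely illustrative (a real callback would also distinguish scale-up from scale-down and roll back on errors):

	static int foo_set_opp(struct dev_pm_set_opp_data *data)
	{
		struct dev_pm_opp_supply *new = data->new_opp.supplies;
		int i, ret;

		/* Example policy: raise all supplies before raising the clock. */
		for (i = 0; i < data->regulator_count; i++) {
			ret = regulator_set_voltage_triplet(data->regulators[i],
							    new[i].u_volt_min,
							    new[i].u_volt,
							    new[i].u_volt_max);
			if (ret)
				return ret;
		}

		return clk_set_rate(data->clk, data->new_opp.rate);
	}

Such a callback would be registered early in probe, before any OPPs are created, via dev_pm_opp_register_set_opp_helper(dev, foo_set_opp).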
1367 1639
1368/** 1640/**
1369 * dev_pm_opp_put_regulator() - Releases resources blocked for regulator 1641 * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
1370 * @dev: Device for which regulator was set. 1642 * set_opp helper
1643 * @dev: Device for which custom set_opp helper has to be cleared.
1371 * 1644 *
1372 * Locking: The internal opp_table and opp structures are RCU protected. 1645 * Locking: The internal opp_table and opp structures are RCU protected.
1373 * Hence this function internally uses RCU updater strategy with mutex locks 1646 * Hence this function internally uses RCU updater strategy with mutex locks
@@ -1375,7 +1648,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
1375 * that this function is *NOT* called under RCU protection or in contexts where 1648 * that this function is *NOT* called under RCU protection or in contexts where
1376 * mutex cannot be locked. 1649 * mutex cannot be locked.
1377 */ 1650 */
1378void dev_pm_opp_put_regulator(struct device *dev) 1651void dev_pm_opp_register_put_opp_helper(struct device *dev)
1379{ 1652{
1380 struct opp_table *opp_table; 1653 struct opp_table *opp_table;
1381 1654
@@ -1389,16 +1662,16 @@ void dev_pm_opp_put_regulator(struct device *dev)
1389 goto unlock; 1662 goto unlock;
1390 } 1663 }
1391 1664
1392 if (IS_ERR(opp_table->regulator)) { 1665 if (!opp_table->set_opp) {
1393 dev_err(dev, "%s: Doesn't have regulator set\n", __func__); 1666 dev_err(dev, "%s: Doesn't have custom set_opp helper set\n",
1667 __func__);
1394 goto unlock; 1668 goto unlock;
1395 } 1669 }
1396 1670
1397 /* Make sure there are no concurrent readers while updating opp_table */ 1671 /* Make sure there are no concurrent readers while updating opp_table */
1398 WARN_ON(!list_empty(&opp_table->opp_list)); 1672 WARN_ON(!list_empty(&opp_table->opp_list));
1399 1673
1400 regulator_put(opp_table->regulator); 1674 opp_table->set_opp = NULL;
1401 opp_table->regulator = ERR_PTR(-ENXIO);
1402 1675
1403 /* Try freeing opp_table if this was the last blocking resource */ 1676 /* Try freeing opp_table if this was the last blocking resource */
1404 _remove_opp_table(opp_table); 1677 _remove_opp_table(opp_table);
@@ -1406,7 +1679,7 @@ void dev_pm_opp_put_regulator(struct device *dev)
1406unlock: 1679unlock:
1407 mutex_unlock(&opp_table_lock); 1680 mutex_unlock(&opp_table_lock);
1408} 1681}
1409EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator); 1682EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
1410 1683
1411/** 1684/**
1412 * dev_pm_opp_add() - Add an OPP table from a table definitions 1685 * dev_pm_opp_add() - Add an OPP table from a table definitions
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
index ef1ae6b52042..95f433db4ac7 100644
--- a/drivers/base/power/opp/debugfs.c
+++ b/drivers/base/power/opp/debugfs.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/limits.h> 17#include <linux/limits.h>
18#include <linux/slab.h>
18 19
19#include "opp.h" 20#include "opp.h"
20 21
@@ -34,6 +35,46 @@ void opp_debug_remove_one(struct dev_pm_opp *opp)
34 debugfs_remove_recursive(opp->dentry); 35 debugfs_remove_recursive(opp->dentry);
35} 36}
36 37
38static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
39 struct opp_table *opp_table,
40 struct dentry *pdentry)
41{
42 struct dentry *d;
43 int i = 0;
44 char *name;
45
46 /* Always create at least supply-0 directory */
47 do {
48 name = kasprintf(GFP_KERNEL, "supply-%d", i);
49
50 /* Create per-opp directory */
51 d = debugfs_create_dir(name, pdentry);
52
53 kfree(name);
54
55 if (!d)
56 return false;
57
58 if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
59 &opp->supplies[i].u_volt))
60 return false;
61
62 if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
63 &opp->supplies[i].u_volt_min))
64 return false;
65
66 if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
67 &opp->supplies[i].u_volt_max))
68 return false;
69
70 if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
71 &opp->supplies[i].u_amp))
72 return false;
73 } while (++i < opp_table->regulator_count);
74
75 return true;
76}
77
37int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) 78int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
38{ 79{
39 struct dentry *pdentry = opp_table->dentry; 80 struct dentry *pdentry = opp_table->dentry;
@@ -63,16 +104,7 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
63 if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate)) 104 if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
64 return -ENOMEM; 105 return -ENOMEM;
65 106
66 if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt)) 107 if (!opp_debug_create_supplies(opp, opp_table, d))
67 return -ENOMEM;
68
69 if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
70 return -ENOMEM;
71
72 if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
73 return -ENOMEM;
74
75 if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
76 return -ENOMEM; 108 return -ENOMEM;
77 109
78 if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d, 110 if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index 5552211e6fcd..3f7d2591b173 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/device.h> 18#include <linux/device.h>
19#include <linux/of.h> 19#include <linux/of.h>
20#include <linux/slab.h>
20#include <linux/export.h> 21#include <linux/export.h>
21 22
22#include "opp.h" 23#include "opp.h"
@@ -101,16 +102,16 @@ static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
101 return true; 102 return true;
102} 103}
103 104
104/* TODO: Support multiple regulators */
105static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, 105static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
106 struct opp_table *opp_table) 106 struct opp_table *opp_table)
107{ 107{
108 u32 microvolt[3] = {0}; 108 u32 *microvolt, *microamp = NULL;
109 u32 val; 109 int supplies, vcount, icount, ret, i, j;
110 int count, ret;
111 struct property *prop = NULL; 110 struct property *prop = NULL;
112 char name[NAME_MAX]; 111 char name[NAME_MAX];
113 112
113 supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
114
114 /* Search for "opp-microvolt-<name>" */ 115 /* Search for "opp-microvolt-<name>" */
115 if (opp_table->prop_name) { 116 if (opp_table->prop_name) {
116 snprintf(name, sizeof(name), "opp-microvolt-%s", 117 snprintf(name, sizeof(name), "opp-microvolt-%s",
@@ -128,34 +129,29 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
128 return 0; 129 return 0;
129 } 130 }
130 131
131 count = of_property_count_u32_elems(opp->np, name); 132 vcount = of_property_count_u32_elems(opp->np, name);
132 if (count < 0) { 133 if (vcount < 0) {
133 dev_err(dev, "%s: Invalid %s property (%d)\n", 134 dev_err(dev, "%s: Invalid %s property (%d)\n",
134 __func__, name, count); 135 __func__, name, vcount);
135 return count; 136 return vcount;
136 } 137 }
137 138
138 /* There can be one or three elements here */ 139 /* There can be one or three elements per supply */
139 if (count != 1 && count != 3) { 140 if (vcount != supplies && vcount != supplies * 3) {
140 dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n", 141 dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
141 __func__, name, count); 142 __func__, name, vcount, supplies);
142 return -EINVAL; 143 return -EINVAL;
143 } 144 }
144 145
145 ret = of_property_read_u32_array(opp->np, name, microvolt, count); 146 microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
147 if (!microvolt)
148 return -ENOMEM;
149
150 ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
146 if (ret) { 151 if (ret) {
147 dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret); 152 dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
148 return -EINVAL; 153 ret = -EINVAL;
149 } 154 goto free_microvolt;
150
151 opp->u_volt = microvolt[0];
152
153 if (count == 1) {
154 opp->u_volt_min = opp->u_volt;
155 opp->u_volt_max = opp->u_volt;
156 } else {
157 opp->u_volt_min = microvolt[1];
158 opp->u_volt_max = microvolt[2];
159 } 155 }
160 156
161 /* Search for "opp-microamp-<name>" */ 157 /* Search for "opp-microamp-<name>" */
@@ -172,10 +168,59 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
172 prop = of_find_property(opp->np, name, NULL); 168 prop = of_find_property(opp->np, name, NULL);
173 } 169 }
174 170
175 if (prop && !of_property_read_u32(opp->np, name, &val)) 171 if (prop) {
176 opp->u_amp = val; 172 icount = of_property_count_u32_elems(opp->np, name);
173 if (icount < 0) {
174 dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
175 name, icount);
176 ret = icount;
177 goto free_microvolt;
178 }
177 179
178 return 0; 180 if (icount != supplies) {
181 dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
182 __func__, name, icount, supplies);
183 ret = -EINVAL;
184 goto free_microvolt;
185 }
186
187 microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
188 if (!microamp) {
 189 			ret = -ENOMEM;
190 goto free_microvolt;
191 }
192
193 ret = of_property_read_u32_array(opp->np, name, microamp,
194 icount);
195 if (ret) {
196 dev_err(dev, "%s: error parsing %s: %d\n", __func__,
197 name, ret);
198 ret = -EINVAL;
199 goto free_microamp;
200 }
201 }
202
203 for (i = 0, j = 0; i < supplies; i++) {
204 opp->supplies[i].u_volt = microvolt[j++];
205
206 if (vcount == supplies) {
207 opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
208 opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
209 } else {
210 opp->supplies[i].u_volt_min = microvolt[j++];
211 opp->supplies[i].u_volt_max = microvolt[j++];
212 }
213
214 if (microamp)
215 opp->supplies[i].u_amp = microamp[i];
216 }
217
218free_microamp:
219 kfree(microamp);
220free_microvolt:
221 kfree(microvolt);
222
223 return ret;
179} 224}
180 225
181/** 226/**
@@ -198,7 +243,7 @@ void dev_pm_opp_of_remove_table(struct device *dev)
198EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); 243EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
199 244
200/* Returns opp descriptor node for a device, caller must do of_node_put() */ 245/* Returns opp descriptor node for a device, caller must do of_node_put() */
201struct device_node *_of_get_opp_desc_node(struct device *dev) 246static struct device_node *_of_get_opp_desc_node(struct device *dev)
202{ 247{
203 /* 248 /*
204 * TODO: Support for multiple OPP tables. 249 * TODO: Support for multiple OPP tables.
@@ -303,9 +348,9 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
303 mutex_unlock(&opp_table_lock); 348 mutex_unlock(&opp_table_lock);
304 349
305 pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n", 350 pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
306 __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt, 351 __func__, new_opp->turbo, new_opp->rate,
307 new_opp->u_volt_min, new_opp->u_volt_max, 352 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
308 new_opp->clock_latency_ns); 353 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
309 354
310 /* 355 /*
311 * Notify the changes in the availability of the operable 356 * Notify the changes in the availability of the operable
@@ -562,7 +607,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
562 /* Get OPP descriptor node */ 607 /* Get OPP descriptor node */
563 np = _of_get_opp_desc_node(cpu_dev); 608 np = _of_get_opp_desc_node(cpu_dev);
564 if (!np) { 609 if (!np) {
565 dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__); 610 dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
566 return -ENOENT; 611 return -ENOENT;
567 } 612 }
568 613
@@ -587,7 +632,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
587 /* Get OPP descriptor node */ 632 /* Get OPP descriptor node */
588 tmp_np = _of_get_opp_desc_node(tcpu_dev); 633 tmp_np = _of_get_opp_desc_node(tcpu_dev);
589 if (!tmp_np) { 634 if (!tmp_np) {
590 dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n", 635 dev_err(tcpu_dev, "%s: Couldn't find opp node.\n",
591 __func__); 636 __func__);
592 ret = -ENOENT; 637 ret = -ENOENT;
593 goto put_cpu_node; 638 goto put_cpu_node;
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index fabd5ca1a083..af9f2b849a66 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -61,10 +61,7 @@ extern struct list_head opp_tables;
61 * @turbo: true if turbo (boost) OPP 61 * @turbo: true if turbo (boost) OPP
62 * @suspend: true if suspend OPP 62 * @suspend: true if suspend OPP
63 * @rate: Frequency in hertz 63 * @rate: Frequency in hertz
64 * @u_volt: Target voltage in microvolts corresponding to this OPP 64 * @supplies: Power supplies voltage/current values
65 * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP
66 * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP
67 * @u_amp: Maximum current drawn by the device in microamperes
68 * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's 65 * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
69 * frequency from any other OPP's frequency. 66 * frequency from any other OPP's frequency.
70 * @opp_table: points back to the opp_table struct this opp belongs to 67 * @opp_table: points back to the opp_table struct this opp belongs to
@@ -83,10 +80,8 @@ struct dev_pm_opp {
83 bool suspend; 80 bool suspend;
84 unsigned long rate; 81 unsigned long rate;
85 82
86 unsigned long u_volt; 83 struct dev_pm_opp_supply *supplies;
87 unsigned long u_volt_min; 84
88 unsigned long u_volt_max;
89 unsigned long u_amp;
90 unsigned long clock_latency_ns; 85 unsigned long clock_latency_ns;
91 86
92 struct opp_table *opp_table; 87 struct opp_table *opp_table;
@@ -144,7 +139,10 @@ enum opp_table_access {
144 * @supported_hw_count: Number of elements in supported_hw array. 139 * @supported_hw_count: Number of elements in supported_hw array.
145 * @prop_name: A name to postfix to many DT properties, while parsing them. 140 * @prop_name: A name to postfix to many DT properties, while parsing them.
146 * @clk: Device's clock handle 141 * @clk: Device's clock handle
147 * @regulator: Supply regulator 142 * @regulators: Supply regulators
143 * @regulator_count: Number of power supply regulators
144 * @set_opp: Platform specific set_opp callback
145 * @set_opp_data: Data to be passed to set_opp callback
148 * @dentry: debugfs dentry pointer of the real device directory (not links). 146 * @dentry: debugfs dentry pointer of the real device directory (not links).
149 * @dentry_name: Name of the real dentry. 147 * @dentry_name: Name of the real dentry.
150 * 148 *
@@ -179,7 +177,11 @@ struct opp_table {
179 unsigned int supported_hw_count; 177 unsigned int supported_hw_count;
180 const char *prop_name; 178 const char *prop_name;
181 struct clk *clk; 179 struct clk *clk;
182 struct regulator *regulator; 180 struct regulator **regulators;
181 unsigned int regulator_count;
182
183 int (*set_opp)(struct dev_pm_set_opp_data *data);
184 struct dev_pm_set_opp_data *set_opp_data;
183 185
184#ifdef CONFIG_DEBUG_FS 186#ifdef CONFIG_DEBUG_FS
185 struct dentry *dentry; 187 struct dentry *dentry;
@@ -190,7 +192,6 @@ struct opp_table {
190/* Routines internal to opp core */ 192/* Routines internal to opp core */
191struct opp_table *_find_opp_table(struct device *dev); 193struct opp_table *_find_opp_table(struct device *dev);
192struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); 194struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
193struct device_node *_of_get_opp_desc_node(struct device *dev);
194void _dev_pm_opp_remove_table(struct device *dev, bool remove_all); 195void _dev_pm_opp_remove_table(struct device *dev, bool remove_all);
195struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table); 196struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table);
196int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table); 197int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index 50e30e7b059d..a84332aefc2d 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -21,14 +21,22 @@ extern void pm_runtime_init(struct device *dev);
21extern void pm_runtime_reinit(struct device *dev); 21extern void pm_runtime_reinit(struct device *dev);
22extern void pm_runtime_remove(struct device *dev); 22extern void pm_runtime_remove(struct device *dev);
23 23
24#define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0)
25#define WAKE_IRQ_DEDICATED_MANAGED BIT(1)
26#define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \
27 WAKE_IRQ_DEDICATED_MANAGED)
28
24struct wake_irq { 29struct wake_irq {
25 struct device *dev; 30 struct device *dev;
31 unsigned int status;
26 int irq; 32 int irq;
27 bool dedicated_irq:1;
28}; 33};
29 34
30extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); 35extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
31extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); 36extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
37extern void dev_pm_enable_wake_irq_check(struct device *dev,
38 bool can_change_status);
39extern void dev_pm_disable_wake_irq_check(struct device *dev);
32 40
33#ifdef CONFIG_PM_SLEEP 41#ifdef CONFIG_PM_SLEEP
34 42
@@ -104,6 +112,15 @@ static inline void dev_pm_disarm_wake_irq(struct wake_irq *wirq)
104{ 112{
105} 113}
106 114
115static inline void dev_pm_enable_wake_irq_check(struct device *dev,
116 bool can_change_status)
117{
118}
119
120static inline void dev_pm_disable_wake_irq_check(struct device *dev)
121{
122}
123
107#endif 124#endif
108 125
109#ifdef CONFIG_PM_SLEEP 126#ifdef CONFIG_PM_SLEEP
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 7f3646e459cb..58fcc758334e 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -856,7 +856,10 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
856 struct dev_pm_qos_request *req; 856 struct dev_pm_qos_request *req;
857 857
858 if (val < 0) { 858 if (val < 0) {
859 ret = -EINVAL; 859 if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT)
860 ret = 0;
861 else
862 ret = -EINVAL;
860 goto out; 863 goto out;
861 } 864 }
862 req = kzalloc(sizeof(*req), GFP_KERNEL); 865 req = kzalloc(sizeof(*req), GFP_KERNEL);
@@ -883,6 +886,7 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
883 mutex_unlock(&dev_pm_qos_mtx); 886 mutex_unlock(&dev_pm_qos_mtx);
884 return ret; 887 return ret;
885} 888}
889EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance);
886 890
887/** 891/**
888 * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace 892 * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace
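The hunk above makes dev_pm_qos_update_user_latency_tolerance() accept the special negative value PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT and exports the symbol for module use. A hedged sketch of a driver resetting an exposed latency tolerance back to "no constraint"; the function name is illustrative only.

	#include <linux/pm_qos.h>

	/* Sketch: expose the per-device latency tolerance attribute and
	 * reset it to the "no constraint" value that the change above
	 * now lets callers pass in. */
	static int example_reset_latency_tolerance(struct device *dev)
	{
		int ret;

		ret = dev_pm_qos_expose_latency_tolerance(dev);
		if (ret)
			return ret;

		return dev_pm_qos_update_user_latency_tolerance(dev,
				PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT);
	}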
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 82a081ea4317..26856d050037 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -241,7 +241,8 @@ static int rpm_check_suspend_allowed(struct device *dev)
241 retval = -EACCES; 241 retval = -EACCES;
242 else if (atomic_read(&dev->power.usage_count) > 0) 242 else if (atomic_read(&dev->power.usage_count) > 0)
243 retval = -EAGAIN; 243 retval = -EAGAIN;
244 else if (!pm_children_suspended(dev)) 244 else if (!dev->power.ignore_children &&
245 atomic_read(&dev->power.child_count))
245 retval = -EBUSY; 246 retval = -EBUSY;
246 247
247 /* Pending resume requests take precedence over suspends. */ 248 /* Pending resume requests take precedence over suspends. */
@@ -515,7 +516,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
515 516
516 callback = RPM_GET_CALLBACK(dev, runtime_suspend); 517 callback = RPM_GET_CALLBACK(dev, runtime_suspend);
517 518
518 dev_pm_enable_wake_irq(dev); 519 dev_pm_enable_wake_irq_check(dev, true);
519 retval = rpm_callback(callback, dev); 520 retval = rpm_callback(callback, dev);
520 if (retval) 521 if (retval)
521 goto fail; 522 goto fail;
@@ -554,7 +555,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
554 return retval; 555 return retval;
555 556
556 fail: 557 fail:
557 dev_pm_disable_wake_irq(dev); 558 dev_pm_disable_wake_irq_check(dev);
558 __update_runtime_status(dev, RPM_ACTIVE); 559 __update_runtime_status(dev, RPM_ACTIVE);
559 dev->power.deferred_resume = false; 560 dev->power.deferred_resume = false;
560 wake_up_all(&dev->power.wait_queue); 561 wake_up_all(&dev->power.wait_queue);
@@ -712,8 +713,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
712 713
713 spin_lock(&parent->power.lock); 714 spin_lock(&parent->power.lock);
714 /* 715 /*
 715 * We can resume if the parent's runtime PM is disabled or it 716 * Resume the parent if it has runtime PM enabled and has not
 716 * is set to ignore children. 717 * been set to ignore its children.
717 */ 718 */
718 if (!parent->power.disable_depth 719 if (!parent->power.disable_depth
719 && !parent->power.ignore_children) { 720 && !parent->power.ignore_children) {
@@ -737,12 +738,12 @@ static int rpm_resume(struct device *dev, int rpmflags)
737 738
738 callback = RPM_GET_CALLBACK(dev, runtime_resume); 739 callback = RPM_GET_CALLBACK(dev, runtime_resume);
739 740
740 dev_pm_disable_wake_irq(dev); 741 dev_pm_disable_wake_irq_check(dev);
741 retval = rpm_callback(callback, dev); 742 retval = rpm_callback(callback, dev);
742 if (retval) { 743 if (retval) {
743 __update_runtime_status(dev, RPM_SUSPENDED); 744 __update_runtime_status(dev, RPM_SUSPENDED);
744 pm_runtime_cancel_pending(dev); 745 pm_runtime_cancel_pending(dev);
745 dev_pm_enable_wake_irq(dev); 746 dev_pm_enable_wake_irq_check(dev, false);
746 } else { 747 } else {
747 no_callback: 748 no_callback:
748 __update_runtime_status(dev, RPM_ACTIVE); 749 __update_runtime_status(dev, RPM_ACTIVE);
@@ -1027,7 +1028,17 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status)
1027 goto out_set; 1028 goto out_set;
1028 1029
1029 if (status == RPM_SUSPENDED) { 1030 if (status == RPM_SUSPENDED) {
1030 /* It always is possible to set the status to 'suspended'. */ 1031 /*
1032 * It is invalid to suspend a device with an active child,
1033 * unless it has been set to ignore its children.
1034 */
1035 if (!dev->power.ignore_children &&
1036 atomic_read(&dev->power.child_count)) {
1037 dev_err(dev, "runtime PM trying to suspend device but active child\n");
1038 error = -EBUSY;
1039 goto out;
1040 }
1041
1031 if (parent) { 1042 if (parent) {
1032 atomic_add_unless(&parent->power.child_count, -1, 0); 1043 atomic_add_unless(&parent->power.child_count, -1, 0);
1033 notify_parent = !parent->power.ignore_children; 1044 notify_parent = !parent->power.ignore_children;
@@ -1478,6 +1489,16 @@ int pm_runtime_force_suspend(struct device *dev)
1478 if (ret) 1489 if (ret)
1479 goto err; 1490 goto err;
1480 1491
1492 /*
 1493 * Increase the runtime PM usage count for the device's parent, in case
 1494 * the device is in use when system suspend is invoked. This tells
 1495 * pm_runtime_force_resume() to resume the parent immediately, which is
 1496 * needed to be able to resume its children when the resume is not
 1497 * deferred to be managed via runtime PM.
1498 */
1499 if (dev->parent && atomic_read(&dev->power.usage_count) > 1)
1500 pm_runtime_get_noresume(dev->parent);
1501
1481 pm_runtime_set_suspended(dev); 1502 pm_runtime_set_suspended(dev);
1482 return 0; 1503 return 0;
1483err: 1504err:
@@ -1487,16 +1508,20 @@ err:
1487EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); 1508EXPORT_SYMBOL_GPL(pm_runtime_force_suspend);
1488 1509
1489/** 1510/**
1490 * pm_runtime_force_resume - Force a device into resume state. 1511 * pm_runtime_force_resume - Force a device into resume state if needed.
1491 * @dev: Device to resume. 1512 * @dev: Device to resume.
1492 * 1513 *
 1493 * Prior to invoking this function we expect the user to have brought the device 1514 * Prior to invoking this function we expect the user to have brought the device
1494 * into low power state by a call to pm_runtime_force_suspend(). Here we reverse 1515 * into low power state by a call to pm_runtime_force_suspend(). Here we reverse
 1495 * those actions and bring the device into full power. We update the runtime PM 1516 * those actions and bring the device into full power, if it is expected to be
1496 * status and re-enables runtime PM. 1517 * used on system resume. To distinguish that, we check whether the runtime PM
1518 * usage count is greater than 1 (the PM core increases the usage count in the
1519 * system PM prepare phase), as that indicates a real user (such as a subsystem,
1520 * driver, userspace, etc.) is using it. If that is the case, the device is
1521 * expected to be used on system resume as well, so then we resume it. In the
1522 * other case, we defer the resume to be managed via runtime PM.
1497 * 1523 *
1498 * Typically this function may be invoked from a system resume callback to make 1524 * Typically this function may be invoked from a system resume callback.
1499 * sure the device is put into full power state.
1500 */ 1525 */
1501int pm_runtime_force_resume(struct device *dev) 1526int pm_runtime_force_resume(struct device *dev)
1502{ 1527{
@@ -1513,6 +1538,17 @@ int pm_runtime_force_resume(struct device *dev)
1513 if (!pm_runtime_status_suspended(dev)) 1538 if (!pm_runtime_status_suspended(dev))
1514 goto out; 1539 goto out;
1515 1540
1541 /*
1542 * Decrease the parent's runtime PM usage count, if we increased it
1543 * during system suspend in pm_runtime_force_suspend().
1544 */
1545 if (atomic_read(&dev->power.usage_count) > 1) {
1546 if (dev->parent)
1547 pm_runtime_put_noidle(dev->parent);
1548 } else {
1549 goto out;
1550 }
1551
1516 ret = pm_runtime_set_active(dev); 1552 ret = pm_runtime_set_active(dev);
1517 if (ret) 1553 if (ret)
1518 goto out; 1554 goto out;
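pm_runtime_force_suspend()/pm_runtime_force_resume() are intended to be wired up as a driver's system sleep callbacks so the runtime PM callbacks get reused for system suspend. A minimal sketch of that pattern, with hypothetical example_* callbacks, is:

	#include <linux/pm.h>
	#include <linux/pm_runtime.h>

	static int example_runtime_suspend(struct device *dev)
	{
		/* put the device into its low-power state */
		return 0;
	}

	static int example_runtime_resume(struct device *dev)
	{
		/* bring the device back to full power */
		return 0;
	}

	static const struct dev_pm_ops example_pm_ops = {
		SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
					pm_runtime_force_resume)
		SET_RUNTIME_PM_OPS(example_runtime_suspend,
				   example_runtime_resume, NULL)
	};

With the hunks above, pm_runtime_force_resume() only powers the device back up when its usage count shows a real user; otherwise the resume is deferred and left to runtime PM.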
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index a7b46798c81d..33b4b902741a 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -263,7 +263,11 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev,
263 s32 value; 263 s32 value;
264 int ret; 264 int ret;
265 265
266 if (kstrtos32(buf, 0, &value)) { 266 if (kstrtos32(buf, 0, &value) == 0) {
267 /* Users can't write negative values directly */
268 if (value < 0)
269 return -EINVAL;
270 } else {
267 if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n")) 271 if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n"))
268 value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; 272 value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
269 else if (!strcmp(buf, "any") || !strcmp(buf, "any\n")) 273 else if (!strcmp(buf, "any") || !strcmp(buf, "any\n"))
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
index 0d77cd6fd8d1..404d94c6c8bc 100644
--- a/drivers/base/power/wakeirq.c
+++ b/drivers/base/power/wakeirq.c
@@ -110,8 +110,10 @@ void dev_pm_clear_wake_irq(struct device *dev)
110 dev->power.wakeirq = NULL; 110 dev->power.wakeirq = NULL;
111 spin_unlock_irqrestore(&dev->power.lock, flags); 111 spin_unlock_irqrestore(&dev->power.lock, flags);
112 112
113 if (wirq->dedicated_irq) 113 if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) {
114 free_irq(wirq->irq, wirq); 114 free_irq(wirq->irq, wirq);
115 wirq->status &= ~WAKE_IRQ_DEDICATED_MASK;
116 }
115 kfree(wirq); 117 kfree(wirq);
116} 118}
117EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq); 119EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq);
@@ -179,7 +181,6 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
179 181
180 wirq->dev = dev; 182 wirq->dev = dev;
181 wirq->irq = irq; 183 wirq->irq = irq;
182 wirq->dedicated_irq = true;
183 irq_set_status_flags(irq, IRQ_NOAUTOEN); 184 irq_set_status_flags(irq, IRQ_NOAUTOEN);
184 185
185 /* 186 /*
@@ -195,6 +196,8 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
195 if (err) 196 if (err)
196 goto err_free_irq; 197 goto err_free_irq;
197 198
199 wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED;
200
198 return err; 201 return err;
199 202
200err_free_irq: 203err_free_irq:
@@ -210,9 +213,9 @@ EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq);
210 * dev_pm_enable_wake_irq - Enable device wake-up interrupt 213 * dev_pm_enable_wake_irq - Enable device wake-up interrupt
211 * @dev: Device 214 * @dev: Device
212 * 215 *
213 * Called from the bus code or the device driver for 216 * Optionally called from the bus code or the device driver for
214 * runtime_suspend() to enable the wake-up interrupt while 217 * runtime_resume() to override the PM runtime core managed wake-up
215 * the device is running. 218 * interrupt handling to enable the wake-up interrupt.
216 * 219 *
 217 * Note that for runtime_suspend() the wake-up interrupts 220 * Note that for runtime_suspend() the wake-up interrupts
218 * should be unconditionally enabled unlike for suspend() 221 * should be unconditionally enabled unlike for suspend()
@@ -222,7 +225,7 @@ void dev_pm_enable_wake_irq(struct device *dev)
222{ 225{
223 struct wake_irq *wirq = dev->power.wakeirq; 226 struct wake_irq *wirq = dev->power.wakeirq;
224 227
225 if (wirq && wirq->dedicated_irq) 228 if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED))
226 enable_irq(wirq->irq); 229 enable_irq(wirq->irq);
227} 230}
228EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq); 231EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
@@ -231,20 +234,73 @@ EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
231 * dev_pm_disable_wake_irq - Disable device wake-up interrupt 234 * dev_pm_disable_wake_irq - Disable device wake-up interrupt
232 * @dev: Device 235 * @dev: Device
233 * 236 *
234 * Called from the bus code or the device driver for 237 * Optionally called from the bus code or the device driver for
235 * runtime_resume() to disable the wake-up interrupt while 238 * runtime_suspend() to override the PM runtime core managed wake-up
236 * the device is running. 239 * interrupt handling to disable the wake-up interrupt.
237 */ 240 */
238void dev_pm_disable_wake_irq(struct device *dev) 241void dev_pm_disable_wake_irq(struct device *dev)
239{ 242{
240 struct wake_irq *wirq = dev->power.wakeirq; 243 struct wake_irq *wirq = dev->power.wakeirq;
241 244
242 if (wirq && wirq->dedicated_irq) 245 if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED))
243 disable_irq_nosync(wirq->irq); 246 disable_irq_nosync(wirq->irq);
244} 247}
245EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); 248EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq);
246 249
247/** 250/**
251 * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt
252 * @dev: Device
253 * @can_change_status: Can change wake-up interrupt status
254 *
 255 * Enables wakeirq conditionally. We need to enable the wake-up interrupt
 256 * lazily on the first rpm_suspend(). This is needed as the consumer device
 257 * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would
 258 * otherwise try to disable an already disabled wakeirq. The wake-up interrupt
259 * starts disabled with IRQ_NOAUTOEN set.
260 *
 261 * Should only be called from the rpm_suspend() and rpm_resume() paths.
262 * Caller must hold &dev->power.lock to change wirq->status
263 */
264void dev_pm_enable_wake_irq_check(struct device *dev,
265 bool can_change_status)
266{
267 struct wake_irq *wirq = dev->power.wakeirq;
268
269 if (!wirq || !((wirq->status & WAKE_IRQ_DEDICATED_MASK)))
270 return;
271
272 if (likely(wirq->status & WAKE_IRQ_DEDICATED_MANAGED)) {
273 goto enable;
274 } else if (can_change_status) {
275 wirq->status |= WAKE_IRQ_DEDICATED_MANAGED;
276 goto enable;
277 }
278
279 return;
280
281enable:
282 enable_irq(wirq->irq);
283}
284
285/**
286 * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt
287 * @dev: Device
288 *
289 * Disables wake-up interrupt conditionally based on status.
 290 * Should only be called from the rpm_suspend() and rpm_resume() paths.
291 */
292void dev_pm_disable_wake_irq_check(struct device *dev)
293{
294 struct wake_irq *wirq = dev->power.wakeirq;
295
296 if (!wirq || !((wirq->status & WAKE_IRQ_DEDICATED_MASK)))
297 return;
298
299 if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED)
300 disable_irq_nosync(wirq->irq);
301}
302
303/**
248 * dev_pm_arm_wake_irq - Arm device wake-up 304 * dev_pm_arm_wake_irq - Arm device wake-up
249 * @wirq: Device wake-up interrupt 305 * @wirq: Device wake-up interrupt
250 * 306 *
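The dedicated wake IRQ is handed to the PM core once at probe time; with the status flags introduced above, the core then enables and disables it from rpm_suspend()/rpm_resume() through the *_check() helpers once the IRQ has been marked MANAGED. A hedged probe-time sketch (the example_* name and IRQ index are hypothetical):

	#include <linux/platform_device.h>
	#include <linux/pm_runtime.h>
	#include <linux/pm_wakeirq.h>
	#include <linux/pm_wakeup.h>

	static int example_probe(struct platform_device *pdev)
	{
		struct device *dev = &pdev->dev;
		int irq, ret;

		irq = platform_get_irq(pdev, 1);	/* dedicated wake-up line */
		if (irq < 0)
			return irq;

		device_init_wakeup(dev, true);
		ret = dev_pm_set_dedicated_wake_irq(dev, irq);
		if (ret)
			return ret;

		pm_runtime_enable(dev);
		return 0;
	}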
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 62e4de2aa8d1..bf9ba26981a5 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -811,7 +811,7 @@ void pm_print_active_wakeup_sources(void)
811 rcu_read_lock(); 811 rcu_read_lock();
812 list_for_each_entry_rcu(ws, &wakeup_sources, entry) { 812 list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
813 if (ws->active) { 813 if (ws->active) {
814 pr_info("active wakeup source: %s\n", ws->name); 814 pr_debug("active wakeup source: %s\n", ws->name);
815 active = 1; 815 active = 1;
816 } else if (!active && 816 } else if (!active &&
817 (!last_activity_ws || 817 (!last_activity_ws ||
@@ -822,7 +822,7 @@ void pm_print_active_wakeup_sources(void)
822 } 822 }
823 823
824 if (!active && last_activity_ws) 824 if (!active && last_activity_ws)
825 pr_info("last active wakeup source: %s\n", 825 pr_debug("last active wakeup source: %s\n",
826 last_activity_ws->name); 826 last_activity_ws->name);
827 rcu_read_unlock(); 827 rcu_read_unlock();
828} 828}
@@ -905,7 +905,7 @@ bool pm_get_wakeup_count(unsigned int *count, bool block)
905 split_counters(&cnt, &inpr); 905 split_counters(&cnt, &inpr);
906 if (inpr == 0 || signal_pending(current)) 906 if (inpr == 0 || signal_pending(current))
907 break; 907 break;
908 908 pm_print_active_wakeup_sources();
909 schedule(); 909 schedule();
910 } 910 }
911 finish_wait(&wakeup_count_wait_queue, &wait); 911 finish_wait(&wakeup_count_wait_queue, &wait);
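pm_get_wakeup_count() is what backs /sys/power/wakeup_count; with the change above it now reports, at pr_debug level, which wakeup source keeps it waiting on every loop iteration. A hedged sketch of the driver side that such a report would point at, using the generic wakeup-event API (the handler name is hypothetical):

	#include <linux/interrupt.h>
	#include <linux/pm_wakeup.h>

	/* Mark a 100 ms wakeup event against the device; while it is
	 * pending, the loop above keeps printing this device's wakeup
	 * source as the reason the count is not settling. */
	static irqreturn_t example_wake_handler(int irq, void *data)
	{
		struct device *dev = data;

		pm_wakeup_event(dev, 100);
		return IRQ_HANDLED;
	}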
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index d89b8afe23b6..920c469f3953 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -12,6 +12,27 @@ config ARM_BIG_LITTLE_CPUFREQ
12 help 12 help
13 This enables the Generic CPUfreq driver for ARM big.LITTLE platforms. 13 This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
14 14
15config ARM_BRCMSTB_AVS_CPUFREQ
16 tristate "Broadcom STB AVS CPUfreq driver"
17 depends on ARCH_BRCMSTB || COMPILE_TEST
18 default y
19 help
20 Some Broadcom STB SoCs use a co-processor running proprietary firmware
21 ("AVS") to handle voltage and frequency scaling. This driver provides
22 a standard CPUfreq interface to to the firmware.
23
24 Say Y, if you have a Broadcom SoC with AVS support for DFS or DVFS.
25
26config ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
27 bool "Broadcom STB AVS CPUfreq driver sysfs debug capability"
28 depends on ARM_BRCMSTB_AVS_CPUFREQ
29 help
 30 Enabling this option turns on debug support via debugfs under
 31 /sys/kernel/debug/brcmstb-avs-cpufreq. It is possible to read all and
 32 write some AVS mailbox registers through debugfs entries.
33
34 If in doubt, say N.
35
15config ARM_DT_BL_CPUFREQ 36config ARM_DT_BL_CPUFREQ
16 tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver" 37 tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver"
17 depends on ARM_BIG_LITTLE_CPUFREQ && OF 38 depends on ARM_BIG_LITTLE_CPUFREQ && OF
@@ -60,14 +81,6 @@ config ARM_IMX6Q_CPUFREQ
60 81
61 If in doubt, say N. 82 If in doubt, say N.
62 83
63config ARM_INTEGRATOR
64 tristate "CPUfreq driver for ARM Integrator CPUs"
65 depends on ARCH_INTEGRATOR
66 default y
67 help
68 This enables the CPUfreq driver for ARM Integrator CPUs.
69 If in doubt, say Y.
70
71config ARM_KIRKWOOD_CPUFREQ 84config ARM_KIRKWOOD_CPUFREQ
72 def_bool MACH_KIRKWOOD 85 def_bool MACH_KIRKWOOD
73 help 86 help
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 0a9b6a093646..1e46c3918e7a 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -51,12 +51,12 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ) += arm_big_little.o
51# LITTLE drivers, so that it is probed last. 51# LITTLE drivers, so that it is probed last.
52obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o 52obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o
53 53
54obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ) += brcmstb-avs-cpufreq.o
54obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o 55obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o
55obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o 56obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o
56obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o 57obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o
57obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o 58obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o
58obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o 59obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o
59obj-$(CONFIG_ARM_INTEGRATOR) += integrator-cpufreq.o
60obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ) += kirkwood-cpufreq.o 60obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ) += kirkwood-cpufreq.o
61obj-$(CONFIG_ARM_MT8173_CPUFREQ) += mt8173-cpufreq.o 61obj-$(CONFIG_ARM_MT8173_CPUFREQ) += mt8173-cpufreq.o
62obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ) += omap-cpufreq.o 62obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ) += omap-cpufreq.o
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 297e9128fe9f..3a98702b7445 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -84,7 +84,6 @@ static inline struct acpi_processor_performance *to_perf_data(struct acpi_cpufre
84static struct cpufreq_driver acpi_cpufreq_driver; 84static struct cpufreq_driver acpi_cpufreq_driver;
85 85
86static unsigned int acpi_pstate_strict; 86static unsigned int acpi_pstate_strict;
87static struct msr __percpu *msrs;
88 87
89static bool boost_state(unsigned int cpu) 88static bool boost_state(unsigned int cpu)
90{ 89{
@@ -104,11 +103,10 @@ static bool boost_state(unsigned int cpu)
104 return false; 103 return false;
105} 104}
106 105
107static void boost_set_msrs(bool enable, const struct cpumask *cpumask) 106static int boost_set_msr(bool enable)
108{ 107{
109 u32 cpu;
110 u32 msr_addr; 108 u32 msr_addr;
111 u64 msr_mask; 109 u64 msr_mask, val;
112 110
113 switch (boot_cpu_data.x86_vendor) { 111 switch (boot_cpu_data.x86_vendor) {
114 case X86_VENDOR_INTEL: 112 case X86_VENDOR_INTEL:
@@ -120,26 +118,31 @@ static void boost_set_msrs(bool enable, const struct cpumask *cpumask)
120 msr_mask = MSR_K7_HWCR_CPB_DIS; 118 msr_mask = MSR_K7_HWCR_CPB_DIS;
121 break; 119 break;
122 default: 120 default:
123 return; 121 return -EINVAL;
124 } 122 }
125 123
126 rdmsr_on_cpus(cpumask, msr_addr, msrs); 124 rdmsrl(msr_addr, val);
127 125
128 for_each_cpu(cpu, cpumask) { 126 if (enable)
129 struct msr *reg = per_cpu_ptr(msrs, cpu); 127 val &= ~msr_mask;
130 if (enable) 128 else
131 reg->q &= ~msr_mask; 129 val |= msr_mask;
132 else 130
133 reg->q |= msr_mask; 131 wrmsrl(msr_addr, val);
134 } 132 return 0;
133}
134
135static void boost_set_msr_each(void *p_en)
136{
137 bool enable = (bool) p_en;
135 138
136 wrmsr_on_cpus(cpumask, msr_addr, msrs); 139 boost_set_msr(enable);
137} 140}
138 141
139static int set_boost(int val) 142static int set_boost(int val)
140{ 143{
141 get_online_cpus(); 144 get_online_cpus();
142 boost_set_msrs(val, cpu_online_mask); 145 on_each_cpu(boost_set_msr_each, (void *)(long)val, 1);
143 put_online_cpus(); 146 put_online_cpus();
144 pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis"); 147 pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis");
145 148
@@ -536,46 +539,24 @@ static void free_acpi_perf_data(void)
536 free_percpu(acpi_perf_data); 539 free_percpu(acpi_perf_data);
537} 540}
538 541
539static int boost_notify(struct notifier_block *nb, unsigned long action, 542static int cpufreq_boost_online(unsigned int cpu)
540 void *hcpu)
541{ 543{
542 unsigned cpu = (long)hcpu; 544 /*
543 const struct cpumask *cpumask; 545 * On the CPU_UP path we simply keep the boost-disable flag
544 546 * in sync with the current global state.
545 cpumask = get_cpu_mask(cpu); 547 */
548 return boost_set_msr(acpi_cpufreq_driver.boost_enabled);
549}
546 550
551static int cpufreq_boost_down_prep(unsigned int cpu)
552{
547 /* 553 /*
548 * Clear the boost-disable bit on the CPU_DOWN path so that 554 * Clear the boost-disable bit on the CPU_DOWN path so that
549 * this cpu cannot block the remaining ones from boosting. On 555 * this cpu cannot block the remaining ones from boosting.
550 * the CPU_UP path we simply keep the boost-disable flag in
551 * sync with the current global state.
552 */ 556 */
553 557 return boost_set_msr(1);
554 switch (action) {
555 case CPU_DOWN_FAILED:
556 case CPU_DOWN_FAILED_FROZEN:
557 case CPU_ONLINE:
558 case CPU_ONLINE_FROZEN:
559 boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
560 break;
561
562 case CPU_DOWN_PREPARE:
563 case CPU_DOWN_PREPARE_FROZEN:
564 boost_set_msrs(1, cpumask);
565 break;
566
567 default:
568 break;
569 }
570
571 return NOTIFY_OK;
572} 558}
573 559
574
575static struct notifier_block boost_nb = {
576 .notifier_call = boost_notify,
577};
578
579/* 560/*
580 * acpi_cpufreq_early_init - initialize ACPI P-States library 561 * acpi_cpufreq_early_init - initialize ACPI P-States library
581 * 562 *
@@ -922,37 +903,35 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
922 .attr = acpi_cpufreq_attr, 903 .attr = acpi_cpufreq_attr,
923}; 904};
924 905
906static enum cpuhp_state acpi_cpufreq_online;
907
925static void __init acpi_cpufreq_boost_init(void) 908static void __init acpi_cpufreq_boost_init(void)
926{ 909{
927 if (boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)) { 910 int ret;
928 msrs = msrs_alloc();
929
930 if (!msrs)
931 return;
932
933 acpi_cpufreq_driver.set_boost = set_boost;
934 acpi_cpufreq_driver.boost_enabled = boost_state(0);
935
936 cpu_notifier_register_begin();
937 911
938 /* Force all MSRs to the same value */ 912 if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)))
939 boost_set_msrs(acpi_cpufreq_driver.boost_enabled, 913 return;
940 cpu_online_mask);
941 914
942 __register_cpu_notifier(&boost_nb); 915 acpi_cpufreq_driver.set_boost = set_boost;
916 acpi_cpufreq_driver.boost_enabled = boost_state(0);
943 917
944 cpu_notifier_register_done(); 918 /*
 919 * This calls the online callback on all online CPUs and forces all
920 * MSRs to the same value.
921 */
922 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online",
923 cpufreq_boost_online, cpufreq_boost_down_prep);
924 if (ret < 0) {
925 pr_err("acpi_cpufreq: failed to register hotplug callbacks\n");
926 return;
945 } 927 }
928 acpi_cpufreq_online = ret;
946} 929}
947 930
948static void acpi_cpufreq_boost_exit(void) 931static void acpi_cpufreq_boost_exit(void)
949{ 932{
950 if (msrs) { 933 if (acpi_cpufreq_online >= 0)
951 unregister_cpu_notifier(&boost_nb); 934 cpuhp_remove_state_nocalls(acpi_cpufreq_online);
952
953 msrs_free(msrs);
954 msrs = NULL;
955 }
956} 935}
957 936
958static int __init acpi_cpufreq_init(void) 937static int __init acpi_cpufreq_init(void)
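The conversion above replaces the CPU notifier with a dynamically allocated hotplug state. The general shape of that pattern (illustrative names, not the driver's own) is:

	#include <linux/cpuhotplug.h>

	static enum cpuhp_state example_online_state;

	static int example_cpu_online(unsigned int cpu)
	{
		/* re-apply per-CPU state (e.g. a boost MSR) for the CPU coming up */
		return 0;
	}

	static int example_cpu_down_prep(unsigned int cpu)
	{
		/* undo per-CPU state before the CPU is taken down */
		return 0;
	}

	static int __init example_register(void)
	{
		int ret;

		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
					example_cpu_online, example_cpu_down_prep);
		if (ret < 0)
			return ret;

		example_online_state = ret;
		return 0;
	}

	static void example_unregister(void)
	{
		cpuhp_remove_state_nocalls(example_online_state);
	}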
diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c
new file mode 100644
index 000000000000..4fda623e55bb
--- /dev/null
+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c
@@ -0,0 +1,1057 @@
1/*
2 * CPU frequency scaling for Broadcom SoCs with AVS firmware that
 3 * supports DFS or DVFS
4 *
5 * Copyright (c) 2016 Broadcom
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation version 2.
10 *
11 * This program is distributed "as is" WITHOUT ANY WARRANTY of any
12 * kind, whether express or implied; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17/*
18 * "AVS" is the name of a firmware developed at Broadcom. It derives
19 * its name from the technique called "Adaptive Voltage Scaling".
20 * Adaptive voltage scaling was the original purpose of this firmware.
21 * The AVS firmware still supports "AVS mode", where all it does is
22 * adaptive voltage scaling. However, on some newer Broadcom SoCs, the
23 * AVS Firmware, despite its unchanged name, also supports DFS mode and
24 * DVFS mode.
25 *
26 * In the context of this document and the related driver, "AVS" by
27 * itself always means the Broadcom firmware and never refers to the
28 * technique called "Adaptive Voltage Scaling".
29 *
30 * The Broadcom STB AVS CPUfreq driver provides voltage and frequency
31 * scaling on Broadcom SoCs using AVS firmware with support for DFS and
32 * DVFS. The AVS firmware is running on its own co-processor. The
33 * driver supports both uniprocessor (UP) and symmetric multiprocessor
34 * (SMP) systems which share clock and voltage across all CPUs.
35 *
36 * Actual voltage and frequency scaling is done solely by the AVS
37 * firmware. This driver does not change frequency or voltage itself.
38 * It provides a standard CPUfreq interface to the rest of the kernel
39 * and to userland. It interfaces with the AVS firmware to effect the
40 * requested changes and to report back the current system status in a
41 * way that is expected by existing tools.
42 */
43
44#include <linux/cpufreq.h>
45#include <linux/interrupt.h>
46#include <linux/io.h>
47#include <linux/module.h>
48#include <linux/of_address.h>
49#include <linux/platform_device.h>
50#include <linux/semaphore.h>
51
52#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
53#include <linux/ctype.h>
54#include <linux/debugfs.h>
55#include <linux/slab.h>
56#include <linux/uaccess.h>
57#endif
58
59/* Max number of arguments AVS calls take */
60#define AVS_MAX_CMD_ARGS 4
61/*
62 * This macro is used to generate AVS parameter register offsets. For
63 * x >= AVS_MAX_CMD_ARGS, it returns 0 to protect against accidental memory
64 * access outside of the parameter range. (Offset 0 is the first parameter.)
65 */
66#define AVS_PARAM_MULT(x) ((x) < AVS_MAX_CMD_ARGS ? (x) : 0)
67
68/* AVS Mailbox Register offsets */
69#define AVS_MBOX_COMMAND 0x00
70#define AVS_MBOX_STATUS 0x04
71#define AVS_MBOX_VOLTAGE0 0x08
72#define AVS_MBOX_TEMP0 0x0c
73#define AVS_MBOX_PV0 0x10
74#define AVS_MBOX_MV0 0x14
75#define AVS_MBOX_PARAM(x) (0x18 + AVS_PARAM_MULT(x) * sizeof(u32))
76#define AVS_MBOX_REVISION 0x28
77#define AVS_MBOX_PSTATE 0x2c
78#define AVS_MBOX_HEARTBEAT 0x30
79#define AVS_MBOX_MAGIC 0x34
80#define AVS_MBOX_SIGMA_HVT 0x38
81#define AVS_MBOX_SIGMA_SVT 0x3c
82#define AVS_MBOX_VOLTAGE1 0x40
83#define AVS_MBOX_TEMP1 0x44
84#define AVS_MBOX_PV1 0x48
85#define AVS_MBOX_MV1 0x4c
86#define AVS_MBOX_FREQUENCY 0x50
87
88/* AVS Commands */
89#define AVS_CMD_AVAILABLE 0x00
90#define AVS_CMD_DISABLE 0x10
91#define AVS_CMD_ENABLE 0x11
92#define AVS_CMD_S2_ENTER 0x12
93#define AVS_CMD_S2_EXIT 0x13
94#define AVS_CMD_BBM_ENTER 0x14
95#define AVS_CMD_BBM_EXIT 0x15
96#define AVS_CMD_S3_ENTER 0x16
97#define AVS_CMD_S3_EXIT 0x17
98#define AVS_CMD_BALANCE 0x18
99/* PMAP and P-STATE commands */
100#define AVS_CMD_GET_PMAP 0x30
101#define AVS_CMD_SET_PMAP 0x31
102#define AVS_CMD_GET_PSTATE 0x40
103#define AVS_CMD_SET_PSTATE 0x41
104
105/* Different modes AVS supports (for GET_PMAP/SET_PMAP) */
106#define AVS_MODE_AVS 0x0
107#define AVS_MODE_DFS 0x1
108#define AVS_MODE_DVS 0x2
109#define AVS_MODE_DVFS 0x3
110
111/*
112 * PMAP parameter p1
113 * unused:31-24, mdiv_p0:23-16, unused:15-14, pdiv:13-10, ndiv_int:9-0
114 */
115#define NDIV_INT_SHIFT 0
116#define NDIV_INT_MASK 0x3ff
117#define PDIV_SHIFT 10
118#define PDIV_MASK 0xf
119#define MDIV_P0_SHIFT 16
120#define MDIV_P0_MASK 0xff
121/*
122 * PMAP parameter p2
123 * mdiv_p4:31-24, mdiv_p3:23-16, mdiv_p2:15-8, mdiv_p1:7-0
124 */
125#define MDIV_P1_SHIFT 0
126#define MDIV_P1_MASK 0xff
127#define MDIV_P2_SHIFT 8
128#define MDIV_P2_MASK 0xff
129#define MDIV_P3_SHIFT 16
130#define MDIV_P3_MASK 0xff
131#define MDIV_P4_SHIFT 24
132#define MDIV_P4_MASK 0xff
133
134/* Different P-STATES AVS supports (for GET_PSTATE/SET_PSTATE) */
135#define AVS_PSTATE_P0 0x0
136#define AVS_PSTATE_P1 0x1
137#define AVS_PSTATE_P2 0x2
138#define AVS_PSTATE_P3 0x3
139#define AVS_PSTATE_P4 0x4
140#define AVS_PSTATE_MAX AVS_PSTATE_P4
141
142/* CPU L2 Interrupt Controller Registers */
143#define AVS_CPU_L2_SET0 0x04
144#define AVS_CPU_L2_INT_MASK BIT(31)
145
146/* AVS Command Status Values */
147#define AVS_STATUS_CLEAR 0x00
148/* Command/notification accepted */
149#define AVS_STATUS_SUCCESS 0xf0
150/* Command/notification rejected */
151#define AVS_STATUS_FAILURE 0xff
152/* Invalid command/notification (unknown) */
153#define AVS_STATUS_INVALID 0xf1
154/* Non-AVS modes are not supported */
155#define AVS_STATUS_NO_SUPP 0xf2
156/* Cannot set P-State until P-Map supplied */
157#define AVS_STATUS_NO_MAP 0xf3
158/* Cannot change P-Map after initial P-Map set */
159#define AVS_STATUS_MAP_SET 0xf4
160/* Max AVS status; higher numbers are used for debugging */
161#define AVS_STATUS_MAX 0xff
162
163/* Other AVS related constants */
164#define AVS_LOOP_LIMIT 10000
165#define AVS_TIMEOUT 300 /* in ms; expected completion is < 10ms */
166#define AVS_FIRMWARE_MAGIC 0xa11600d1
167
168#define BRCM_AVS_CPUFREQ_PREFIX "brcmstb-avs"
169#define BRCM_AVS_CPUFREQ_NAME BRCM_AVS_CPUFREQ_PREFIX "-cpufreq"
170#define BRCM_AVS_CPU_DATA "brcm,avs-cpu-data-mem"
171#define BRCM_AVS_CPU_INTR "brcm,avs-cpu-l2-intr"
172#define BRCM_AVS_HOST_INTR "sw_intr"
173
174struct pmap {
175 unsigned int mode;
176 unsigned int p1;
177 unsigned int p2;
178 unsigned int state;
179};
180
181struct private_data {
182 void __iomem *base;
183 void __iomem *avs_intr_base;
184 struct device *dev;
185#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
186 struct dentry *debugfs;
187#endif
188 struct completion done;
189 struct semaphore sem;
190 struct pmap pmap;
191};
192
193#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
194
195enum debugfs_format {
196 DEBUGFS_NORMAL,
197 DEBUGFS_FLOAT,
198 DEBUGFS_REV,
199};
200
201struct debugfs_data {
202 struct debugfs_entry *entry;
203 struct private_data *priv;
204};
205
206struct debugfs_entry {
207 char *name;
208 u32 offset;
209 fmode_t mode;
210 enum debugfs_format format;
211};
212
213#define DEBUGFS_ENTRY(name, mode, format) { \
214 #name, AVS_MBOX_##name, mode, format \
215}
216
217/*
218 * These are used for debugfs only. Otherwise we use AVS_MBOX_PARAM() directly.
219 */
220#define AVS_MBOX_PARAM1 AVS_MBOX_PARAM(0)
221#define AVS_MBOX_PARAM2 AVS_MBOX_PARAM(1)
222#define AVS_MBOX_PARAM3 AVS_MBOX_PARAM(2)
223#define AVS_MBOX_PARAM4 AVS_MBOX_PARAM(3)
224
225/*
226 * This table stores the name, access permissions and offset for each hardware
227 * register and is used to generate debugfs entries.
228 */
229static struct debugfs_entry debugfs_entries[] = {
230 DEBUGFS_ENTRY(COMMAND, S_IWUSR, DEBUGFS_NORMAL),
231 DEBUGFS_ENTRY(STATUS, S_IWUSR, DEBUGFS_NORMAL),
232 DEBUGFS_ENTRY(VOLTAGE0, 0, DEBUGFS_FLOAT),
233 DEBUGFS_ENTRY(TEMP0, 0, DEBUGFS_FLOAT),
234 DEBUGFS_ENTRY(PV0, 0, DEBUGFS_FLOAT),
235 DEBUGFS_ENTRY(MV0, 0, DEBUGFS_FLOAT),
236 DEBUGFS_ENTRY(PARAM1, S_IWUSR, DEBUGFS_NORMAL),
237 DEBUGFS_ENTRY(PARAM2, S_IWUSR, DEBUGFS_NORMAL),
238 DEBUGFS_ENTRY(PARAM3, S_IWUSR, DEBUGFS_NORMAL),
239 DEBUGFS_ENTRY(PARAM4, S_IWUSR, DEBUGFS_NORMAL),
240 DEBUGFS_ENTRY(REVISION, 0, DEBUGFS_REV),
241 DEBUGFS_ENTRY(PSTATE, 0, DEBUGFS_NORMAL),
242 DEBUGFS_ENTRY(HEARTBEAT, 0, DEBUGFS_NORMAL),
243 DEBUGFS_ENTRY(MAGIC, S_IWUSR, DEBUGFS_NORMAL),
244 DEBUGFS_ENTRY(SIGMA_HVT, 0, DEBUGFS_NORMAL),
245 DEBUGFS_ENTRY(SIGMA_SVT, 0, DEBUGFS_NORMAL),
246 DEBUGFS_ENTRY(VOLTAGE1, 0, DEBUGFS_FLOAT),
247 DEBUGFS_ENTRY(TEMP1, 0, DEBUGFS_FLOAT),
248 DEBUGFS_ENTRY(PV1, 0, DEBUGFS_FLOAT),
249 DEBUGFS_ENTRY(MV1, 0, DEBUGFS_FLOAT),
250 DEBUGFS_ENTRY(FREQUENCY, 0, DEBUGFS_NORMAL),
251};
252
253static int brcm_avs_target_index(struct cpufreq_policy *, unsigned int);
254
255static char *__strtolower(char *s)
256{
257 char *p;
258
259 for (p = s; *p; p++)
260 *p = tolower(*p);
261
262 return s;
263}
264
265#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */
266
267static void __iomem *__map_region(const char *name)
268{
269 struct device_node *np;
270 void __iomem *ptr;
271
272 np = of_find_compatible_node(NULL, NULL, name);
273 if (!np)
274 return NULL;
275
276 ptr = of_iomap(np, 0);
277 of_node_put(np);
278
279 return ptr;
280}
281
282static int __issue_avs_command(struct private_data *priv, int cmd, bool is_send,
283 u32 args[])
284{
285 unsigned long time_left = msecs_to_jiffies(AVS_TIMEOUT);
286 void __iomem *base = priv->base;
287 unsigned int i;
288 int ret;
289 u32 val;
290
291 ret = down_interruptible(&priv->sem);
292 if (ret)
293 return ret;
294
295 /*
296 * Make sure no other command is currently running: cmd is 0 if AVS
297 * co-processor is idle. Due to the guard above, we should almost never
298 * have to wait here.
299 */
300 for (i = 0, val = 1; val != 0 && i < AVS_LOOP_LIMIT; i++)
301 val = readl(base + AVS_MBOX_COMMAND);
302
303 /* Give the caller a chance to retry if AVS is busy. */
304 if (i == AVS_LOOP_LIMIT) {
305 ret = -EAGAIN;
306 goto out;
307 }
308
309 /* Clear status before we begin. */
310 writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS);
311
312 /* We need to send arguments for this command. */
313 if (args && is_send) {
314 for (i = 0; i < AVS_MAX_CMD_ARGS; i++)
315 writel(args[i], base + AVS_MBOX_PARAM(i));
316 }
317
318 /* Protect from spurious interrupts. */
319 reinit_completion(&priv->done);
320
321 /* Now issue the command & tell firmware to wake up to process it. */
322 writel(cmd, base + AVS_MBOX_COMMAND);
323 writel(AVS_CPU_L2_INT_MASK, priv->avs_intr_base + AVS_CPU_L2_SET0);
324
325 /* Wait for AVS co-processor to finish processing the command. */
326 time_left = wait_for_completion_timeout(&priv->done, time_left);
327
328 /*
329 * If the AVS status is not in the expected range, it means AVS didn't
330 * complete our command in time, and we return an error. Also, if there
331 * is no "time left", we timed out waiting for the interrupt.
332 */
333 val = readl(base + AVS_MBOX_STATUS);
334 if (time_left == 0 || val == 0 || val > AVS_STATUS_MAX) {
335 dev_err(priv->dev, "AVS command %#x didn't complete in time\n",
336 cmd);
337 dev_err(priv->dev, " Time left: %u ms, AVS status: %#x\n",
338 jiffies_to_msecs(time_left), val);
339 ret = -ETIMEDOUT;
340 goto out;
341 }
342
343 /* This command returned arguments, so we read them back. */
344 if (args && !is_send) {
345 for (i = 0; i < AVS_MAX_CMD_ARGS; i++)
346 args[i] = readl(base + AVS_MBOX_PARAM(i));
347 }
348
349 /* Clear status to tell AVS co-processor we are done. */
350 writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS);
351
352 /* Convert firmware errors to errno's as much as possible. */
353 switch (val) {
354 case AVS_STATUS_INVALID:
355 ret = -EINVAL;
356 break;
357 case AVS_STATUS_NO_SUPP:
358 ret = -ENOTSUPP;
359 break;
360 case AVS_STATUS_NO_MAP:
361 ret = -ENOENT;
362 break;
363 case AVS_STATUS_MAP_SET:
364 ret = -EEXIST;
365 break;
366 case AVS_STATUS_FAILURE:
367 ret = -EIO;
368 break;
369 }
370
371out:
372 up(&priv->sem);
373
374 return ret;
375}
376
377static irqreturn_t irq_handler(int irq, void *data)
378{
379 struct private_data *priv = data;
380
381 /* AVS command completed execution. Wake up __issue_avs_command(). */
382 complete(&priv->done);
383
384 return IRQ_HANDLED;
385}
386
387static char *brcm_avs_mode_to_string(unsigned int mode)
388{
389 switch (mode) {
390 case AVS_MODE_AVS:
391 return "AVS";
392 case AVS_MODE_DFS:
393 return "DFS";
394 case AVS_MODE_DVS:
395 return "DVS";
396 case AVS_MODE_DVFS:
397 return "DVFS";
398 }
399 return NULL;
400}
401
402static void brcm_avs_parse_p1(u32 p1, unsigned int *mdiv_p0, unsigned int *pdiv,
403 unsigned int *ndiv)
404{
405 *mdiv_p0 = (p1 >> MDIV_P0_SHIFT) & MDIV_P0_MASK;
406 *pdiv = (p1 >> PDIV_SHIFT) & PDIV_MASK;
407 *ndiv = (p1 >> NDIV_INT_SHIFT) & NDIV_INT_MASK;
408}
409
410static void brcm_avs_parse_p2(u32 p2, unsigned int *mdiv_p1,
411 unsigned int *mdiv_p2, unsigned int *mdiv_p3,
412 unsigned int *mdiv_p4)
413{
414 *mdiv_p4 = (p2 >> MDIV_P4_SHIFT) & MDIV_P4_MASK;
415 *mdiv_p3 = (p2 >> MDIV_P3_SHIFT) & MDIV_P3_MASK;
416 *mdiv_p2 = (p2 >> MDIV_P2_SHIFT) & MDIV_P2_MASK;
417 *mdiv_p1 = (p2 >> MDIV_P1_SHIFT) & MDIV_P1_MASK;
418}
419
420static int brcm_avs_get_pmap(struct private_data *priv, struct pmap *pmap)
421{
422 u32 args[AVS_MAX_CMD_ARGS];
423 int ret;
424
425 ret = __issue_avs_command(priv, AVS_CMD_GET_PMAP, false, args);
426 if (ret || !pmap)
427 return ret;
428
429 pmap->mode = args[0];
430 pmap->p1 = args[1];
431 pmap->p2 = args[2];
432 pmap->state = args[3];
433
434 return 0;
435}
436
437static int brcm_avs_set_pmap(struct private_data *priv, struct pmap *pmap)
438{
439 u32 args[AVS_MAX_CMD_ARGS];
440
441 args[0] = pmap->mode;
442 args[1] = pmap->p1;
443 args[2] = pmap->p2;
444 args[3] = pmap->state;
445
446 return __issue_avs_command(priv, AVS_CMD_SET_PMAP, true, args);
447}
448
449static int brcm_avs_get_pstate(struct private_data *priv, unsigned int *pstate)
450{
451 u32 args[AVS_MAX_CMD_ARGS];
452 int ret;
453
454 ret = __issue_avs_command(priv, AVS_CMD_GET_PSTATE, false, args);
455 if (ret)
456 return ret;
457 *pstate = args[0];
458
459 return 0;
460}
461
462static int brcm_avs_set_pstate(struct private_data *priv, unsigned int pstate)
463{
464 u32 args[AVS_MAX_CMD_ARGS];
465
466 args[0] = pstate;
467
468 return __issue_avs_command(priv, AVS_CMD_SET_PSTATE, true, args);
469}
470
471static unsigned long brcm_avs_get_voltage(void __iomem *base)
472{
473 return readl(base + AVS_MBOX_VOLTAGE1);
474}
475
476static unsigned long brcm_avs_get_frequency(void __iomem *base)
477{
478 return readl(base + AVS_MBOX_FREQUENCY) * 1000; /* in kHz */
479}
480
481/*
482 * We determine which frequencies are supported by cycling through all P-states
483 * and reading back what frequency we are running at for each P-state.
484 */
485static struct cpufreq_frequency_table *
486brcm_avs_get_freq_table(struct device *dev, struct private_data *priv)
487{
488 struct cpufreq_frequency_table *table;
489 unsigned int pstate;
490 int i, ret;
491
492 /* Remember P-state for later */
493 ret = brcm_avs_get_pstate(priv, &pstate);
494 if (ret)
495 return ERR_PTR(ret);
496
497 table = devm_kzalloc(dev, (AVS_PSTATE_MAX + 1) * sizeof(*table),
498 GFP_KERNEL);
499 if (!table)
500 return ERR_PTR(-ENOMEM);
501
502 for (i = AVS_PSTATE_P0; i <= AVS_PSTATE_MAX; i++) {
503 ret = brcm_avs_set_pstate(priv, i);
504 if (ret)
505 return ERR_PTR(ret);
506 table[i].frequency = brcm_avs_get_frequency(priv->base);
507 table[i].driver_data = i;
508 }
509 table[i].frequency = CPUFREQ_TABLE_END;
510
511 /* Restore P-state */
512 ret = brcm_avs_set_pstate(priv, pstate);
513 if (ret)
514 return ERR_PTR(ret);
515
516 return table;
517}
518
519#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
520
521#define MANT(x) (unsigned int)(abs((x)) / 1000)
522#define FRAC(x) (unsigned int)(abs((x)) - abs((x)) / 1000 * 1000)
523
524static int brcm_avs_debug_show(struct seq_file *s, void *data)
525{
526 struct debugfs_data *dbgfs = s->private;
527 void __iomem *base;
528 u32 val, offset;
529
530 if (!dbgfs) {
531 seq_puts(s, "No device pointer\n");
532 return 0;
533 }
534
535 base = dbgfs->priv->base;
536 offset = dbgfs->entry->offset;
537 val = readl(base + offset);
538 switch (dbgfs->entry->format) {
539 case DEBUGFS_NORMAL:
540 seq_printf(s, "%u\n", val);
541 break;
542 case DEBUGFS_FLOAT:
543 seq_printf(s, "%d.%03d\n", MANT(val), FRAC(val));
544 break;
545 case DEBUGFS_REV:
546 seq_printf(s, "%c.%c.%c.%c\n", (val >> 24 & 0xff),
547 (val >> 16 & 0xff), (val >> 8 & 0xff),
548 val & 0xff);
549 break;
550 }
551 seq_printf(s, "0x%08x\n", val);
552
553 return 0;
554}
555
556#undef MANT
557#undef FRAC
558
559static ssize_t brcm_avs_seq_write(struct file *file, const char __user *buf,
560 size_t size, loff_t *ppos)
561{
562 struct seq_file *s = file->private_data;
563 struct debugfs_data *dbgfs = s->private;
564 struct private_data *priv = dbgfs->priv;
565 void __iomem *base, *avs_intr_base;
566 bool use_issue_command = false;
567 unsigned long val, offset;
568 char str[128];
569 int ret;
570 char *str_ptr = str;
571
572 if (size >= sizeof(str))
573 return -E2BIG;
574
575 memset(str, 0, sizeof(str));
576 ret = copy_from_user(str, buf, size);
577 if (ret)
578 return ret;
579
580 base = priv->base;
581 avs_intr_base = priv->avs_intr_base;
582 offset = dbgfs->entry->offset;
583 /*
584 * Special case writing to "command" entry only: if the string starts
585 * with a 'c', we use the driver's __issue_avs_command() function.
586 * Otherwise, we perform a raw write. This should allow testing of raw
587 * access as well as using the higher level function. (Raw access
588 * doesn't clear the firmware return status after issuing the command.)
589 */
590 if (str_ptr[0] == 'c' && offset == AVS_MBOX_COMMAND) {
591 use_issue_command = true;
592 str_ptr++;
593 }
594 if (kstrtoul(str_ptr, 0, &val) != 0)
595 return -EINVAL;
596
597 /*
598 * Setting the P-state is a special case. We need to update the CPU
599 * frequency we report.
600 */
601 if (val == AVS_CMD_SET_PSTATE) {
602 struct cpufreq_policy *policy;
603 unsigned int pstate;
604
605 policy = cpufreq_cpu_get(smp_processor_id());
606 /* Read back the P-state we are about to set */
607 pstate = readl(base + AVS_MBOX_PARAM(0));
608 if (use_issue_command) {
609 ret = brcm_avs_target_index(policy, pstate);
610 return ret ? ret : size;
611 }
612 policy->cur = policy->freq_table[pstate].frequency;
613 }
614
615 if (use_issue_command) {
616 ret = __issue_avs_command(priv, val, false, NULL);
617 } else {
618 /* Locking here is not perfect, but is only for debug. */
619 ret = down_interruptible(&priv->sem);
620 if (ret)
621 return ret;
622
623 writel(val, base + offset);
624 /* We have to wake up the firmware to process a command. */
625 if (offset == AVS_MBOX_COMMAND)
626 writel(AVS_CPU_L2_INT_MASK,
627 avs_intr_base + AVS_CPU_L2_SET0);
628 up(&priv->sem);
629 }
630
631 return ret ? ret : size;
632}
633
634static struct debugfs_entry *__find_debugfs_entry(const char *name)
635{
636 int i;
637
638 for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++)
639 if (strcasecmp(debugfs_entries[i].name, name) == 0)
640 return &debugfs_entries[i];
641
642 return NULL;
643}
644
645static int brcm_avs_debug_open(struct inode *inode, struct file *file)
646{
647 struct debugfs_data *data;
648 fmode_t fmode;
649 int ret;
650
651 /*
652 * seq_open(), which is called by single_open(), clears "write" access.
653 * We need write access to some files, so we preserve our access mode
654 * and restore it.
655 */
656 fmode = file->f_mode;
657 /*
658 * Check access permissions even for root. We don't want to be writing
659 * to read-only registers. Access for regular users has already been
660 * checked by the VFS layer.
661 */
662 if ((fmode & FMODE_WRITER) && !(inode->i_mode & S_IWUSR))
663 return -EACCES;
664
665 data = kmalloc(sizeof(*data), GFP_KERNEL);
666 if (!data)
667 return -ENOMEM;
668 /*
669 * We use the same file system operations for all our debug files. To
670 * produce specific output, we look up the file name upon opening a
671 * debugfs entry and map it to a memory offset. This offset is then used
672 * in the generic "show" function to read a specific register.
673 */
674 data->entry = __find_debugfs_entry(file->f_path.dentry->d_iname);
675 data->priv = inode->i_private;
676
677 ret = single_open(file, brcm_avs_debug_show, data);
678 if (ret)
679 kfree(data);
680 file->f_mode = fmode;
681
682 return ret;
683}
684
685static int brcm_avs_debug_release(struct inode *inode, struct file *file)
686{
687 struct seq_file *seq_priv = file->private_data;
688 struct debugfs_data *data = seq_priv->private;
689
690 kfree(data);
691 return single_release(inode, file);
692}
693
694static const struct file_operations brcm_avs_debug_ops = {
695 .open = brcm_avs_debug_open,
696 .read = seq_read,
697 .write = brcm_avs_seq_write,
698 .llseek = seq_lseek,
699 .release = brcm_avs_debug_release,
700};
701
702static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev)
703{
704 struct private_data *priv = platform_get_drvdata(pdev);
705 struct dentry *dir;
706 int i;
707
708 if (!priv)
709 return;
710
711 dir = debugfs_create_dir(BRCM_AVS_CPUFREQ_NAME, NULL);
712 if (IS_ERR_OR_NULL(dir))
713 return;
714 priv->debugfs = dir;
715
716 for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) {
717 /*
718 * The DEBUGFS_ENTRY macro generates uppercase strings. We
719 * convert them to lowercase before creating the debugfs
720 * entries.
721 */
722 char *entry = __strtolower(debugfs_entries[i].name);
723 fmode_t mode = debugfs_entries[i].mode;
724
725 if (!debugfs_create_file(entry, S_IFREG | S_IRUGO | mode,
726 dir, priv, &brcm_avs_debug_ops)) {
727 priv->debugfs = NULL;
728 debugfs_remove_recursive(dir);
729 break;
730 }
731 }
732}
733
734static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev)
735{
736 struct private_data *priv = platform_get_drvdata(pdev);
737
738 if (priv && priv->debugfs) {
739 debugfs_remove_recursive(priv->debugfs);
740 priv->debugfs = NULL;
741 }
742}
743
744#else
745
746static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) {}
747static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) {}
748
749#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */
750
751/*
752 * To ensure the right firmware is running we need to
753 * - check the MAGIC matches what we expect
754 * - brcm_avs_get_pmap() doesn't return -ENOTSUPP or -EINVAL
755 * We need to set up our interrupt handling before calling brcm_avs_get_pmap()!
756 */
757static bool brcm_avs_is_firmware_loaded(struct private_data *priv)
758{
759 u32 magic;
760 int rc;
761
762 rc = brcm_avs_get_pmap(priv, NULL);
763 magic = readl(priv->base + AVS_MBOX_MAGIC);
764
765 return (magic == AVS_FIRMWARE_MAGIC) && (rc != -ENOTSUPP) &&
766 (rc != -EINVAL);
767}
768
769static unsigned int brcm_avs_cpufreq_get(unsigned int cpu)
770{
771 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
772 struct private_data *priv = policy->driver_data;
773
774 return brcm_avs_get_frequency(priv->base);
775}
776
777static int brcm_avs_target_index(struct cpufreq_policy *policy,
778 unsigned int index)
779{
780 return brcm_avs_set_pstate(policy->driver_data,
781 policy->freq_table[index].driver_data);
782}
783
784static int brcm_avs_suspend(struct cpufreq_policy *policy)
785{
786 struct private_data *priv = policy->driver_data;
787
788 return brcm_avs_get_pmap(priv, &priv->pmap);
789}
790
791static int brcm_avs_resume(struct cpufreq_policy *policy)
792{
793 struct private_data *priv = policy->driver_data;
794 int ret;
795
796 ret = brcm_avs_set_pmap(priv, &priv->pmap);
797 if (ret == -EEXIST) {
798 struct platform_device *pdev = cpufreq_get_driver_data();
799 struct device *dev = &pdev->dev;
800
801 dev_warn(dev, "PMAP was already set\n");
802 ret = 0;
803 }
804
805 return ret;
806}
807
808/*
809 * All initialization code that we only want to execute once goes here. Setup
810 * code that can be re-tried on every core (if it failed before) can go into
811 * brcm_avs_cpufreq_init().
812 */
813static int brcm_avs_prepare_init(struct platform_device *pdev)
814{
815 struct private_data *priv;
816 struct device *dev;
817 int host_irq, ret;
818
819 dev = &pdev->dev;
820 priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
821 if (!priv)
822 return -ENOMEM;
823
824 priv->dev = dev;
825 sema_init(&priv->sem, 1);
826 init_completion(&priv->done);
827 platform_set_drvdata(pdev, priv);
828
829 priv->base = __map_region(BRCM_AVS_CPU_DATA);
830 if (!priv->base) {
831 dev_err(dev, "Couldn't find property %s in device tree.\n",
832 BRCM_AVS_CPU_DATA);
833 return -ENOENT;
834 }
835
836 priv->avs_intr_base = __map_region(BRCM_AVS_CPU_INTR);
837 if (!priv->avs_intr_base) {
838 dev_err(dev, "Couldn't find property %s in device tree.\n",
839 BRCM_AVS_CPU_INTR);
840 ret = -ENOENT;
841 goto unmap_base;
842 }
843
844 host_irq = platform_get_irq_byname(pdev, BRCM_AVS_HOST_INTR);
845 if (host_irq < 0) {
846 dev_err(dev, "Couldn't find interrupt %s -- %d\n",
847 BRCM_AVS_HOST_INTR, host_irq);
848 ret = host_irq;
849 goto unmap_intr_base;
850 }
851
852 ret = devm_request_irq(dev, host_irq, irq_handler, IRQF_TRIGGER_RISING,
853 BRCM_AVS_HOST_INTR, priv);
854 if (ret) {
855 dev_err(dev, "IRQ request failed: %s (%d) -- %d\n",
856 BRCM_AVS_HOST_INTR, host_irq, ret);
857 goto unmap_intr_base;
858 }
859
860 if (brcm_avs_is_firmware_loaded(priv))
861 return 0;
862
863 dev_err(dev, "AVS firmware is not loaded or doesn't support DVFS\n");
864 ret = -ENODEV;
865
866unmap_intr_base:
867 iounmap(priv->avs_intr_base);
868unmap_base:
869 iounmap(priv->base);
870 platform_set_drvdata(pdev, NULL);
871
872 return ret;
873}
874
875static int brcm_avs_cpufreq_init(struct cpufreq_policy *policy)
876{
877 struct cpufreq_frequency_table *freq_table;
878 struct platform_device *pdev;
879 struct private_data *priv;
880 struct device *dev;
881 int ret;
882
883 pdev = cpufreq_get_driver_data();
884 priv = platform_get_drvdata(pdev);
885 policy->driver_data = priv;
886 dev = &pdev->dev;
887
888 freq_table = brcm_avs_get_freq_table(dev, priv);
889 if (IS_ERR(freq_table)) {
890 ret = PTR_ERR(freq_table);
891 dev_err(dev, "Couldn't determine frequency table (%d).\n", ret);
892 return ret;
893 }
894
895 ret = cpufreq_table_validate_and_show(policy, freq_table);
896 if (ret) {
897 dev_err(dev, "invalid frequency table: %d\n", ret);
898 return ret;
899 }
900
901 /* All cores share the same clock and thus the same policy. */
902 cpumask_setall(policy->cpus);
903
904 ret = __issue_avs_command(priv, AVS_CMD_ENABLE, false, NULL);
905 if (!ret) {
906 unsigned int pstate;
907
908 ret = brcm_avs_get_pstate(priv, &pstate);
909 if (!ret) {
910 policy->cur = freq_table[pstate].frequency;
911 dev_info(dev, "registered\n");
912 return 0;
913 }
914 }
915
916 dev_err(dev, "couldn't initialize driver (%d)\n", ret);
917
918 return ret;
919}
920
921static ssize_t show_brcm_avs_pstate(struct cpufreq_policy *policy, char *buf)
922{
923 struct private_data *priv = policy->driver_data;
924 unsigned int pstate;
925
926 if (brcm_avs_get_pstate(priv, &pstate))
927 return sprintf(buf, "<unknown>\n");
928
929 return sprintf(buf, "%u\n", pstate);
930}
931
932static ssize_t show_brcm_avs_mode(struct cpufreq_policy *policy, char *buf)
933{
934 struct private_data *priv = policy->driver_data;
935 struct pmap pmap;
936
937 if (brcm_avs_get_pmap(priv, &pmap))
938 return sprintf(buf, "<unknown>\n");
939
940 return sprintf(buf, "%s %u\n", brcm_avs_mode_to_string(pmap.mode),
941 pmap.mode);
942}
943
944static ssize_t show_brcm_avs_pmap(struct cpufreq_policy *policy, char *buf)
945{
946 unsigned int mdiv_p0, mdiv_p1, mdiv_p2, mdiv_p3, mdiv_p4;
947 struct private_data *priv = policy->driver_data;
948 unsigned int ndiv, pdiv;
949 struct pmap pmap;
950
951 if (brcm_avs_get_pmap(priv, &pmap))
952 return sprintf(buf, "<unknown>\n");
953
954 brcm_avs_parse_p1(pmap.p1, &mdiv_p0, &pdiv, &ndiv);
955 brcm_avs_parse_p2(pmap.p2, &mdiv_p1, &mdiv_p2, &mdiv_p3, &mdiv_p4);
956
957 return sprintf(buf, "0x%08x 0x%08x %u %u %u %u %u %u %u\n",
958 pmap.p1, pmap.p2, ndiv, pdiv, mdiv_p0, mdiv_p1, mdiv_p2,
959 mdiv_p3, mdiv_p4);
960}
961
962static ssize_t show_brcm_avs_voltage(struct cpufreq_policy *policy, char *buf)
963{
964 struct private_data *priv = policy->driver_data;
965
966 return sprintf(buf, "0x%08lx\n", brcm_avs_get_voltage(priv->base));
967}
968
969static ssize_t show_brcm_avs_frequency(struct cpufreq_policy *policy, char *buf)
970{
971 struct private_data *priv = policy->driver_data;
972
973 return sprintf(buf, "0x%08lx\n", brcm_avs_get_frequency(priv->base));
974}
975
976cpufreq_freq_attr_ro(brcm_avs_pstate);
977cpufreq_freq_attr_ro(brcm_avs_mode);
978cpufreq_freq_attr_ro(brcm_avs_pmap);
979cpufreq_freq_attr_ro(brcm_avs_voltage);
980cpufreq_freq_attr_ro(brcm_avs_frequency);
981
982static struct freq_attr *brcm_avs_cpufreq_attr[] = {
983 &cpufreq_freq_attr_scaling_available_freqs,
984 &brcm_avs_pstate,
985 &brcm_avs_mode,
986 &brcm_avs_pmap,
987 &brcm_avs_voltage,
988 &brcm_avs_frequency,
989 NULL
990};
991
992static struct cpufreq_driver brcm_avs_driver = {
993 .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK,
994 .verify = cpufreq_generic_frequency_table_verify,
995 .target_index = brcm_avs_target_index,
996 .get = brcm_avs_cpufreq_get,
997 .suspend = brcm_avs_suspend,
998 .resume = brcm_avs_resume,
999 .init = brcm_avs_cpufreq_init,
1000 .attr = brcm_avs_cpufreq_attr,
1001 .name = BRCM_AVS_CPUFREQ_PREFIX,
1002};
1003
1004static int brcm_avs_cpufreq_probe(struct platform_device *pdev)
1005{
1006 int ret;
1007
1008 ret = brcm_avs_prepare_init(pdev);
1009 if (ret)
1010 return ret;
1011
1012 brcm_avs_driver.driver_data = pdev;
1013 ret = cpufreq_register_driver(&brcm_avs_driver);
1014 if (!ret)
1015 brcm_avs_cpufreq_debug_init(pdev);
1016
1017 return ret;
1018}
1019
1020static int brcm_avs_cpufreq_remove(struct platform_device *pdev)
1021{
1022 struct private_data *priv;
1023 int ret;
1024
1025 ret = cpufreq_unregister_driver(&brcm_avs_driver);
1026 if (ret)
1027 return ret;
1028
1029 brcm_avs_cpufreq_debug_exit(pdev);
1030
1031 priv = platform_get_drvdata(pdev);
1032 iounmap(priv->base);
1033 iounmap(priv->avs_intr_base);
1034 platform_set_drvdata(pdev, NULL);
1035
1036 return 0;
1037}
1038
1039static const struct of_device_id brcm_avs_cpufreq_match[] = {
1040 { .compatible = BRCM_AVS_CPU_DATA },
1041 { }
1042};
1043MODULE_DEVICE_TABLE(of, brcm_avs_cpufreq_match);
1044
1045static struct platform_driver brcm_avs_cpufreq_platdrv = {
1046 .driver = {
1047 .name = BRCM_AVS_CPUFREQ_NAME,
1048 .of_match_table = brcm_avs_cpufreq_match,
1049 },
1050 .probe = brcm_avs_cpufreq_probe,
1051 .remove = brcm_avs_cpufreq_remove,
1052};
1053module_platform_driver(brcm_avs_cpufreq_platdrv);
1054
1055MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>");
1056MODULE_DESCRIPTION("CPUfreq driver for Broadcom STB AVS");
1057MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 4852d9efe74e..e82bb3c30b92 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -247,3 +247,10 @@ MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+ spec");
247MODULE_LICENSE("GPL"); 247MODULE_LICENSE("GPL");
248 248
249late_initcall(cppc_cpufreq_init); 249late_initcall(cppc_cpufreq_init);
250
251static const struct acpi_device_id cppc_acpi_ids[] = {
252 {ACPI_PROCESSOR_DEVICE_HID, },
253 {}
254};
255
256MODULE_DEVICE_TABLE(acpi, cppc_acpi_ids);
diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 71267626456b..bc97b6a4b1cf 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -26,6 +26,9 @@ static const struct of_device_id machines[] __initconst = {
26 { .compatible = "allwinner,sun8i-a83t", }, 26 { .compatible = "allwinner,sun8i-a83t", },
27 { .compatible = "allwinner,sun8i-h3", }, 27 { .compatible = "allwinner,sun8i-h3", },
28 28
29 { .compatible = "arm,integrator-ap", },
30 { .compatible = "arm,integrator-cp", },
31
29 { .compatible = "hisilicon,hi6220", }, 32 { .compatible = "hisilicon,hi6220", },
30 33
31 { .compatible = "fsl,imx27", }, 34 { .compatible = "fsl,imx27", },
@@ -34,6 +37,8 @@ static const struct of_device_id machines[] __initconst = {
34 { .compatible = "fsl,imx7d", }, 37 { .compatible = "fsl,imx7d", },
35 38
36 { .compatible = "marvell,berlin", }, 39 { .compatible = "marvell,berlin", },
40 { .compatible = "marvell,pxa250", },
41 { .compatible = "marvell,pxa270", },
37 42
38 { .compatible = "samsung,exynos3250", }, 43 { .compatible = "samsung,exynos3250", },
39 { .compatible = "samsung,exynos4210", }, 44 { .compatible = "samsung,exynos4210", },
@@ -50,6 +55,8 @@ static const struct of_device_id machines[] __initconst = {
50 { .compatible = "renesas,r7s72100", }, 55 { .compatible = "renesas,r7s72100", },
51 { .compatible = "renesas,r8a73a4", }, 56 { .compatible = "renesas,r8a73a4", },
52 { .compatible = "renesas,r8a7740", }, 57 { .compatible = "renesas,r8a7740", },
58 { .compatible = "renesas,r8a7743", },
59 { .compatible = "renesas,r8a7745", },
53 { .compatible = "renesas,r8a7778", }, 60 { .compatible = "renesas,r8a7778", },
54 { .compatible = "renesas,r8a7779", }, 61 { .compatible = "renesas,r8a7779", },
55 { .compatible = "renesas,r8a7790", }, 62 { .compatible = "renesas,r8a7790", },
@@ -72,6 +79,12 @@ static const struct of_device_id machines[] __initconst = {
72 79
73 { .compatible = "sigma,tango4" }, 80 { .compatible = "sigma,tango4" },
74 81
82 { .compatible = "socionext,uniphier-pro5", },
83 { .compatible = "socionext,uniphier-pxs2", },
84 { .compatible = "socionext,uniphier-ld6b", },
85 { .compatible = "socionext,uniphier-ld11", },
86 { .compatible = "socionext,uniphier-ld20", },
87
75 { .compatible = "ti,am33xx", }, 88 { .compatible = "ti,am33xx", },
76 { .compatible = "ti,dra7", }, 89 { .compatible = "ti,dra7", },
77 { .compatible = "ti,omap2", }, 90 { .compatible = "ti,omap2", },
@@ -81,6 +94,8 @@ static const struct of_device_id machines[] __initconst = {
81 94
82 { .compatible = "xlnx,zynq-7000", }, 95 { .compatible = "xlnx,zynq-7000", },
83 96
97 { .compatible = "zte,zx296718", },
98
84 { } 99 { }
85}; 100};
86 101
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 5c07ae05d69a..269013311e79 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -28,6 +28,7 @@
28#include "cpufreq-dt.h" 28#include "cpufreq-dt.h"
29 29
30struct private_data { 30struct private_data {
31 struct opp_table *opp_table;
31 struct device *cpu_dev; 32 struct device *cpu_dev;
32 struct thermal_cooling_device *cdev; 33 struct thermal_cooling_device *cdev;
33 const char *reg_name; 34 const char *reg_name;
@@ -143,6 +144,7 @@ static int resources_available(void)
143static int cpufreq_init(struct cpufreq_policy *policy) 144static int cpufreq_init(struct cpufreq_policy *policy)
144{ 145{
145 struct cpufreq_frequency_table *freq_table; 146 struct cpufreq_frequency_table *freq_table;
147 struct opp_table *opp_table = NULL;
146 struct private_data *priv; 148 struct private_data *priv;
147 struct device *cpu_dev; 149 struct device *cpu_dev;
148 struct clk *cpu_clk; 150 struct clk *cpu_clk;
@@ -186,8 +188,9 @@ static int cpufreq_init(struct cpufreq_policy *policy)
186 */ 188 */
187 name = find_supply_name(cpu_dev); 189 name = find_supply_name(cpu_dev);
188 if (name) { 190 if (name) {
189 ret = dev_pm_opp_set_regulator(cpu_dev, name); 191 opp_table = dev_pm_opp_set_regulators(cpu_dev, &name, 1);
190 if (ret) { 192 if (IS_ERR(opp_table)) {
193 ret = PTR_ERR(opp_table);
191 dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", 194 dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
192 policy->cpu, ret); 195 policy->cpu, ret);
193 goto out_put_clk; 196 goto out_put_clk;
@@ -237,6 +240,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
237 } 240 }
238 241
239 priv->reg_name = name; 242 priv->reg_name = name;
243 priv->opp_table = opp_table;
240 244
241 ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); 245 ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
242 if (ret) { 246 if (ret) {
@@ -285,7 +289,7 @@ out_free_priv:
285out_free_opp: 289out_free_opp:
286 dev_pm_opp_of_cpumask_remove_table(policy->cpus); 290 dev_pm_opp_of_cpumask_remove_table(policy->cpus);
287 if (name) 291 if (name)
288 dev_pm_opp_put_regulator(cpu_dev); 292 dev_pm_opp_put_regulators(opp_table);
289out_put_clk: 293out_put_clk:
290 clk_put(cpu_clk); 294 clk_put(cpu_clk);
291 295
@@ -300,7 +304,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
300 dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); 304 dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
301 dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); 305 dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
302 if (priv->reg_name) 306 if (priv->reg_name)
303 dev_pm_opp_put_regulator(priv->cpu_dev); 307 dev_pm_opp_put_regulators(priv->opp_table);
304 308
305 clk_put(policy->clk); 309 clk_put(policy->clk);
306 kfree(priv); 310 kfree(priv);
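The hunks above migrate cpufreq-dt from dev_pm_opp_set_regulator()/dev_pm_opp_put_regulator() to the array-based API that hands back an opp_table handle. A minimal sketch of the new call pattern follows; the helper name and the single supply string are illustrative, only the dev_pm_opp_*_regulators() calls come from the diff.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/pm_opp.h>

/* Sketch of the handle-based regulator API used by cpufreq-dt above. */
static int example_cpu_opp_setup(struct device *cpu_dev, const char *supply)
{
	struct opp_table *opp_table;

	/* Second argument is an array of supply names, third is its length. */
	opp_table = dev_pm_opp_set_regulators(cpu_dev, &supply, 1);
	if (IS_ERR(opp_table))
		return PTR_ERR(opp_table);

	/* ... build OPP and cpufreq tables, register the policy ... */

	/* Teardown now takes the handle instead of the device. */
	dev_pm_opp_put_regulators(opp_table);
	return 0;
}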
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6e6c1fb60fbc..cc475eff90b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1526,7 +1526,10 @@ unsigned int cpufreq_get(unsigned int cpu)
1526 1526
1527 if (policy) { 1527 if (policy) {
1528 down_read(&policy->rwsem); 1528 down_read(&policy->rwsem);
1529 ret_freq = __cpufreq_get(policy); 1529
1530 if (!policy_is_inactive(policy))
1531 ret_freq = __cpufreq_get(policy);
1532
1530 up_read(&policy->rwsem); 1533 up_read(&policy->rwsem);
1531 1534
1532 cpufreq_cpu_put(policy); 1535 cpufreq_cpu_put(policy);
@@ -2254,17 +2257,19 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
2254 * Useful for policy notifiers which have different necessities 2257 * Useful for policy notifiers which have different necessities
2255 * at different times. 2258 * at different times.
2256 */ 2259 */
2257int cpufreq_update_policy(unsigned int cpu) 2260void cpufreq_update_policy(unsigned int cpu)
2258{ 2261{
2259 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); 2262 struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
2260 struct cpufreq_policy new_policy; 2263 struct cpufreq_policy new_policy;
2261 int ret;
2262 2264
2263 if (!policy) 2265 if (!policy)
2264 return -ENODEV; 2266 return;
2265 2267
2266 down_write(&policy->rwsem); 2268 down_write(&policy->rwsem);
2267 2269
2270 if (policy_is_inactive(policy))
2271 goto unlock;
2272
2268 pr_debug("updating policy for CPU %u\n", cpu); 2273 pr_debug("updating policy for CPU %u\n", cpu);
2269 memcpy(&new_policy, policy, sizeof(*policy)); 2274 memcpy(&new_policy, policy, sizeof(*policy));
2270 new_policy.min = policy->user_policy.min; 2275 new_policy.min = policy->user_policy.min;
@@ -2275,24 +2280,20 @@ int cpufreq_update_policy(unsigned int cpu)
2275 * -> ask driver for current freq and notify governors about a change 2280 * -> ask driver for current freq and notify governors about a change
2276 */ 2281 */
2277 if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { 2282 if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
2278 if (cpufreq_suspended) { 2283 if (cpufreq_suspended)
2279 ret = -EAGAIN;
2280 goto unlock; 2284 goto unlock;
2281 } 2285
2282 new_policy.cur = cpufreq_update_current_freq(policy); 2286 new_policy.cur = cpufreq_update_current_freq(policy);
2283 if (WARN_ON(!new_policy.cur)) { 2287 if (WARN_ON(!new_policy.cur))
2284 ret = -EIO;
2285 goto unlock; 2288 goto unlock;
2286 }
2287 } 2289 }
2288 2290
2289 ret = cpufreq_set_policy(policy, &new_policy); 2291 cpufreq_set_policy(policy, &new_policy);
2290 2292
2291unlock: 2293unlock:
2292 up_write(&policy->rwsem); 2294 up_write(&policy->rwsem);
2293 2295
2294 cpufreq_cpu_put(policy); 2296 cpufreq_cpu_put(policy);
2295 return ret;
2296} 2297}
2297EXPORT_SYMBOL(cpufreq_update_policy); 2298EXPORT_SYMBOL(cpufreq_update_policy);
2298 2299
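With this change cpufreq_update_policy() becomes a void, best-effort notification: it quietly bails out for inactive policies, when cpufreq is suspended, or when the current frequency cannot be read. Callers that used to propagate its return code simply drop it; a rough sketch (the caller name is made up for illustration):

#include <linux/cpufreq.h>

/* Illustrative caller only; nothing left to check or propagate. */
static void example_ppc_notify(unsigned int cpu)
{
	/*
	 * Re-evaluate the policy limits for this CPU. Failures (inactive
	 * policy, suspended cpufreq) are now handled silently inside.
	 */
	cpufreq_update_policy(cpu);
}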
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 13475890d792..992f7c20760f 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -37,16 +37,16 @@ struct cs_dbs_tuners {
37#define DEF_SAMPLING_DOWN_FACTOR (1) 37#define DEF_SAMPLING_DOWN_FACTOR (1)
38#define MAX_SAMPLING_DOWN_FACTOR (10) 38#define MAX_SAMPLING_DOWN_FACTOR (10)
39 39
40static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, 40static inline unsigned int get_freq_step(struct cs_dbs_tuners *cs_tuners,
41 struct cpufreq_policy *policy) 41 struct cpufreq_policy *policy)
42{ 42{
43 unsigned int freq_target = (cs_tuners->freq_step * policy->max) / 100; 43 unsigned int freq_step = (cs_tuners->freq_step * policy->max) / 100;
44 44
45 /* max freq cannot be less than 100. But who knows... */ 45 /* max freq cannot be less than 100. But who knows... */
46 if (unlikely(freq_target == 0)) 46 if (unlikely(freq_step == 0))
47 freq_target = DEF_FREQUENCY_STEP; 47 freq_step = DEF_FREQUENCY_STEP;
48 48
49 return freq_target; 49 return freq_step;
50} 50}
51 51
52/* 52/*
@@ -55,10 +55,10 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
55 * sampling_down_factor, we check, if current idle time is more than 80% 55 * sampling_down_factor, we check, if current idle time is more than 80%
56 * (default), then we try to decrease frequency 56 * (default), then we try to decrease frequency
57 * 57 *
58 * Any frequency increase takes it to the maximum frequency. Frequency reduction 58 * Frequency updates happen at minimum steps of 5% (default) of maximum
59 * happens at minimum steps of 5% (default) of maximum frequency 59 * frequency
60 */ 60 */
61static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) 61static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
62{ 62{
63 struct policy_dbs_info *policy_dbs = policy->governor_data; 63 struct policy_dbs_info *policy_dbs = policy->governor_data;
64 struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); 64 struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
@@ -66,6 +66,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
66 struct dbs_data *dbs_data = policy_dbs->dbs_data; 66 struct dbs_data *dbs_data = policy_dbs->dbs_data;
67 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 67 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
68 unsigned int load = dbs_update(policy); 68 unsigned int load = dbs_update(policy);
69 unsigned int freq_step;
69 70
70 /* 71 /*
71 * break out if we 'cannot' reduce the speed as the user might 72 * break out if we 'cannot' reduce the speed as the user might
@@ -82,6 +83,23 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
82 if (requested_freq > policy->max || requested_freq < policy->min) 83 if (requested_freq > policy->max || requested_freq < policy->min)
83 requested_freq = policy->cur; 84 requested_freq = policy->cur;
84 85
86 freq_step = get_freq_step(cs_tuners, policy);
87
88 /*
89 * Decrease requested_freq one freq_step for each idle period that
90 * we didn't update the frequency.
91 */
92 if (policy_dbs->idle_periods < UINT_MAX) {
93 unsigned int freq_steps = policy_dbs->idle_periods * freq_step;
94
95 if (requested_freq > freq_steps)
96 requested_freq -= freq_steps;
97 else
98 requested_freq = policy->min;
99
100 policy_dbs->idle_periods = UINT_MAX;
101 }
102
85 /* Check for frequency increase */ 103 /* Check for frequency increase */
86 if (load > dbs_data->up_threshold) { 104 if (load > dbs_data->up_threshold) {
87 dbs_info->down_skip = 0; 105 dbs_info->down_skip = 0;
@@ -90,7 +108,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
90 if (requested_freq == policy->max) 108 if (requested_freq == policy->max)
91 goto out; 109 goto out;
92 110
93 requested_freq += get_freq_target(cs_tuners, policy); 111 requested_freq += freq_step;
94 if (requested_freq > policy->max) 112 if (requested_freq > policy->max)
95 requested_freq = policy->max; 113 requested_freq = policy->max;
96 114
@@ -106,16 +124,14 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
106 124
107 /* Check for frequency decrease */ 125 /* Check for frequency decrease */
108 if (load < cs_tuners->down_threshold) { 126 if (load < cs_tuners->down_threshold) {
109 unsigned int freq_target;
110 /* 127 /*
111 * if we cannot reduce the frequency anymore, break out early 128 * if we cannot reduce the frequency anymore, break out early
112 */ 129 */
113 if (requested_freq == policy->min) 130 if (requested_freq == policy->min)
114 goto out; 131 goto out;
115 132
116 freq_target = get_freq_target(cs_tuners, policy); 133 if (requested_freq > freq_step)
117 if (requested_freq > freq_target) 134 requested_freq -= freq_step;
118 requested_freq -= freq_target;
119 else 135 else
120 requested_freq = policy->min; 136 requested_freq = policy->min;
121 137
@@ -305,7 +321,7 @@ static void cs_start(struct cpufreq_policy *policy)
305static struct dbs_governor cs_governor = { 321static struct dbs_governor cs_governor = {
306 .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"), 322 .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"),
307 .kobj_type = { .default_attrs = cs_attributes }, 323 .kobj_type = { .default_attrs = cs_attributes },
308 .gov_dbs_timer = cs_dbs_timer, 324 .gov_dbs_update = cs_dbs_update,
309 .alloc = cs_alloc, 325 .alloc = cs_alloc,
310 .free = cs_free, 326 .free = cs_free,
311 .init = cs_init, 327 .init = cs_init,
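To make the new decay concrete: freq_step defaults to 5% of policy->max, so on a 2,000,000 kHz CPU one step is 100,000 kHz, and three missed (idle) sampling periods lower requested_freq by 300,000 kHz before the usual load checks run, clamped at policy->min. A small sketch of that arithmetic, with placeholder names rather than driver symbols:

#include <linux/kernel.h>	/* UINT_MAX */

/* Sketch of the per-idle-period decay added to cs_dbs_update() above. */
static unsigned int example_decay(unsigned int requested_freq,
				  unsigned int policy_min,
				  unsigned int freq_step,
				  unsigned int idle_periods)
{
	/* UINT_MAX is the "no idle periods seen" sentinel set by dbs_update(). */
	if (idle_periods != UINT_MAX) {
		unsigned int delta = idle_periods * freq_step;

		requested_freq = requested_freq > delta ?
				 requested_freq - delta : policy_min;
	}

	return requested_freq;
}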
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 642dd0f183a8..0196467280bd 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -61,7 +61,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
61 * entries can't be freed concurrently. 61 * entries can't be freed concurrently.
62 */ 62 */
63 list_for_each_entry(policy_dbs, &attr_set->policy_list, list) { 63 list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
64 mutex_lock(&policy_dbs->timer_mutex); 64 mutex_lock(&policy_dbs->update_mutex);
65 /* 65 /*
66 * On 32-bit architectures this may race with the 66 * On 32-bit architectures this may race with the
67 * sample_delay_ns read in dbs_update_util_handler(), but that 67 * sample_delay_ns read in dbs_update_util_handler(), but that
@@ -76,7 +76,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
76 * taken, so it shouldn't be significant. 76 * taken, so it shouldn't be significant.
77 */ 77 */
78 gov_update_sample_delay(policy_dbs, 0); 78 gov_update_sample_delay(policy_dbs, 0);
79 mutex_unlock(&policy_dbs->timer_mutex); 79 mutex_unlock(&policy_dbs->update_mutex);
80 } 80 }
81 81
82 return count; 82 return count;
@@ -117,7 +117,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
117 struct policy_dbs_info *policy_dbs = policy->governor_data; 117 struct policy_dbs_info *policy_dbs = policy->governor_data;
118 struct dbs_data *dbs_data = policy_dbs->dbs_data; 118 struct dbs_data *dbs_data = policy_dbs->dbs_data;
119 unsigned int ignore_nice = dbs_data->ignore_nice_load; 119 unsigned int ignore_nice = dbs_data->ignore_nice_load;
120 unsigned int max_load = 0; 120 unsigned int max_load = 0, idle_periods = UINT_MAX;
121 unsigned int sampling_rate, io_busy, j; 121 unsigned int sampling_rate, io_busy, j;
122 122
123 /* 123 /*
@@ -215,9 +215,19 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
215 j_cdbs->prev_load = load; 215 j_cdbs->prev_load = load;
216 } 216 }
217 217
218 if (time_elapsed > 2 * sampling_rate) {
219 unsigned int periods = time_elapsed / sampling_rate;
220
221 if (periods < idle_periods)
222 idle_periods = periods;
223 }
224
218 if (load > max_load) 225 if (load > max_load)
219 max_load = load; 226 max_load = load;
220 } 227 }
228
229 policy_dbs->idle_periods = idle_periods;
230
221 return max_load; 231 return max_load;
222} 232}
223EXPORT_SYMBOL_GPL(dbs_update); 233EXPORT_SYMBOL_GPL(dbs_update);
@@ -236,9 +246,9 @@ static void dbs_work_handler(struct work_struct *work)
236 * Make sure cpufreq_governor_limits() isn't evaluating load or the 246 * Make sure cpufreq_governor_limits() isn't evaluating load or the
237 * ondemand governor isn't updating the sampling rate in parallel. 247 * ondemand governor isn't updating the sampling rate in parallel.
238 */ 248 */
239 mutex_lock(&policy_dbs->timer_mutex); 249 mutex_lock(&policy_dbs->update_mutex);
240 gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy)); 250 gov_update_sample_delay(policy_dbs, gov->gov_dbs_update(policy));
241 mutex_unlock(&policy_dbs->timer_mutex); 251 mutex_unlock(&policy_dbs->update_mutex);
242 252
243 /* Allow the utilization update handler to queue up more work. */ 253 /* Allow the utilization update handler to queue up more work. */
244 atomic_set(&policy_dbs->work_count, 0); 254 atomic_set(&policy_dbs->work_count, 0);
@@ -348,7 +358,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli
348 return NULL; 358 return NULL;
349 359
350 policy_dbs->policy = policy; 360 policy_dbs->policy = policy;
351 mutex_init(&policy_dbs->timer_mutex); 361 mutex_init(&policy_dbs->update_mutex);
352 atomic_set(&policy_dbs->work_count, 0); 362 atomic_set(&policy_dbs->work_count, 0);
353 init_irq_work(&policy_dbs->irq_work, dbs_irq_work); 363 init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
354 INIT_WORK(&policy_dbs->work, dbs_work_handler); 364 INIT_WORK(&policy_dbs->work, dbs_work_handler);
@@ -367,7 +377,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs,
367{ 377{
368 int j; 378 int j;
369 379
370 mutex_destroy(&policy_dbs->timer_mutex); 380 mutex_destroy(&policy_dbs->update_mutex);
371 381
372 for_each_cpu(j, policy_dbs->policy->related_cpus) { 382 for_each_cpu(j, policy_dbs->policy->related_cpus) {
373 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); 383 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
@@ -547,10 +557,10 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy)
547{ 557{
548 struct policy_dbs_info *policy_dbs = policy->governor_data; 558 struct policy_dbs_info *policy_dbs = policy->governor_data;
549 559
550 mutex_lock(&policy_dbs->timer_mutex); 560 mutex_lock(&policy_dbs->update_mutex);
551 cpufreq_policy_apply_limits(policy); 561 cpufreq_policy_apply_limits(policy);
552 gov_update_sample_delay(policy_dbs, 0); 562 gov_update_sample_delay(policy_dbs, 0);
553 563
554 mutex_unlock(&policy_dbs->timer_mutex); 564 mutex_unlock(&policy_dbs->update_mutex);
555} 565}
556EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits); 566EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits);
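The idle-period accounting itself lives in dbs_update(): when the wall-clock gap since the previous sample exceeds twice the sampling rate, the number of whole periods that elapsed is recorded, and the policy keeps the minimum across its CPUs. A per-CPU sketch with example numbers (sampling rate 10,000 us, a 45,000 us gap yields 4 periods):

#include <linux/kernel.h>	/* UINT_MAX */

/* Per-CPU sketch; the policy-wide value is the minimum over all its CPUs. */
static unsigned int example_idle_periods(unsigned int time_elapsed_us,
					 unsigned int sampling_rate_us)
{
	if (time_elapsed_us > 2 * sampling_rate_us)
		return time_elapsed_us / sampling_rate_us;

	return UINT_MAX;	/* sentinel: no idle periods detected */
}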
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index ef1037e9c92b..f5717ca070cc 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -85,7 +85,7 @@ struct policy_dbs_info {
85 * Per policy mutex that serializes load evaluation from limit-change 85 * Per policy mutex that serializes load evaluation from limit-change
86 * and work-handler. 86 * and work-handler.
87 */ 87 */
88 struct mutex timer_mutex; 88 struct mutex update_mutex;
89 89
90 u64 last_sample_time; 90 u64 last_sample_time;
91 s64 sample_delay_ns; 91 s64 sample_delay_ns;
@@ -97,6 +97,7 @@ struct policy_dbs_info {
97 struct list_head list; 97 struct list_head list;
98 /* Multiplier for increasing sample delay temporarily. */ 98 /* Multiplier for increasing sample delay temporarily. */
99 unsigned int rate_mult; 99 unsigned int rate_mult;
100 unsigned int idle_periods; /* For conservative */
100 /* Status indicators */ 101 /* Status indicators */
101 bool is_shared; /* This object is used by multiple CPUs */ 102 bool is_shared; /* This object is used by multiple CPUs */
102 bool work_in_progress; /* Work is being queued up or in progress */ 103 bool work_in_progress; /* Work is being queued up or in progress */
@@ -135,7 +136,7 @@ struct dbs_governor {
135 */ 136 */
136 struct dbs_data *gdbs_data; 137 struct dbs_data *gdbs_data;
137 138
138 unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); 139 unsigned int (*gov_dbs_update)(struct cpufreq_policy *policy);
139 struct policy_dbs_info *(*alloc)(void); 140 struct policy_dbs_info *(*alloc)(void);
140 void (*free)(struct policy_dbs_info *policy_dbs); 141 void (*free)(struct policy_dbs_info *policy_dbs);
141 int (*init)(struct dbs_data *dbs_data); 142 int (*init)(struct dbs_data *dbs_data);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 3a1f49f5f4c6..4a017e895296 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -25,7 +25,7 @@
25#define MAX_SAMPLING_DOWN_FACTOR (100000) 25#define MAX_SAMPLING_DOWN_FACTOR (100000)
26#define MICRO_FREQUENCY_UP_THRESHOLD (95) 26#define MICRO_FREQUENCY_UP_THRESHOLD (95)
27#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) 27#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000)
28#define MIN_FREQUENCY_UP_THRESHOLD (11) 28#define MIN_FREQUENCY_UP_THRESHOLD (1)
29#define MAX_FREQUENCY_UP_THRESHOLD (100) 29#define MAX_FREQUENCY_UP_THRESHOLD (100)
30 30
31static struct od_ops od_ops; 31static struct od_ops od_ops;
@@ -169,7 +169,7 @@ static void od_update(struct cpufreq_policy *policy)
169 } 169 }
170} 170}
171 171
172static unsigned int od_dbs_timer(struct cpufreq_policy *policy) 172static unsigned int od_dbs_update(struct cpufreq_policy *policy)
173{ 173{
174 struct policy_dbs_info *policy_dbs = policy->governor_data; 174 struct policy_dbs_info *policy_dbs = policy->governor_data;
175 struct dbs_data *dbs_data = policy_dbs->dbs_data; 175 struct dbs_data *dbs_data = policy_dbs->dbs_data;
@@ -191,7 +191,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
191 od_update(policy); 191 od_update(policy);
192 192
193 if (dbs_info->freq_lo) { 193 if (dbs_info->freq_lo) {
194 /* Setup timer for SUB_SAMPLE */ 194 /* Setup SUB_SAMPLE */
195 dbs_info->sample_type = OD_SUB_SAMPLE; 195 dbs_info->sample_type = OD_SUB_SAMPLE;
196 return dbs_info->freq_hi_delay_us; 196 return dbs_info->freq_hi_delay_us;
197 } 197 }
@@ -255,11 +255,11 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
255 list_for_each_entry(policy_dbs, &attr_set->policy_list, list) { 255 list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
256 /* 256 /*
257 * Doing this without locking might lead to using different 257 * Doing this without locking might lead to using different
258 * rate_mult values in od_update() and od_dbs_timer(). 258 * rate_mult values in od_update() and od_dbs_update().
259 */ 259 */
260 mutex_lock(&policy_dbs->timer_mutex); 260 mutex_lock(&policy_dbs->update_mutex);
261 policy_dbs->rate_mult = 1; 261 policy_dbs->rate_mult = 1;
262 mutex_unlock(&policy_dbs->timer_mutex); 262 mutex_unlock(&policy_dbs->update_mutex);
263 } 263 }
264 264
265 return count; 265 return count;
@@ -374,8 +374,7 @@ static int od_init(struct dbs_data *dbs_data)
374 dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; 374 dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
375 /* 375 /*
376 * In nohz/micro accounting case we set the minimum frequency 376 * In nohz/micro accounting case we set the minimum frequency
377 * not depending on HZ, but fixed (very low). The deferred 377 * not depending on HZ, but fixed (very low).
378 * timer might skip some samples if idle/sleeping as needed.
379 */ 378 */
380 dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; 379 dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
381 } else { 380 } else {
@@ -415,7 +414,7 @@ static struct od_ops od_ops = {
415static struct dbs_governor od_dbs_gov = { 414static struct dbs_governor od_dbs_gov = {
416 .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"), 415 .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"),
417 .kobj_type = { .default_attrs = od_attributes }, 416 .kobj_type = { .default_attrs = od_attributes },
418 .gov_dbs_timer = od_dbs_timer, 417 .gov_dbs_update = od_dbs_update,
419 .alloc = od_alloc, 418 .alloc = od_alloc,
420 .free = od_free, 419 .free = od_free,
421 .init = od_init, 420 .init = od_init,
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 06d3abdffd3a..ac284e66839c 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -41,6 +41,18 @@ static int cpufreq_stats_update(struct cpufreq_stats *stats)
41 return 0; 41 return 0;
42} 42}
43 43
44static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
45{
46 unsigned int count = stats->max_state;
47
48 memset(stats->time_in_state, 0, count * sizeof(u64));
49#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
50 memset(stats->trans_table, 0, count * count * sizeof(int));
51#endif
52 stats->last_time = get_jiffies_64();
53 stats->total_trans = 0;
54}
55
44static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) 56static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
45{ 57{
46 return sprintf(buf, "%d\n", policy->stats->total_trans); 58 return sprintf(buf, "%d\n", policy->stats->total_trans);
@@ -64,6 +76,14 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
64 return len; 76 return len;
65} 77}
66 78
79static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
80 size_t count)
81{
82 /* We don't care what is written to the attribute. */
83 cpufreq_stats_clear_table(policy->stats);
84 return count;
85}
86
67#ifdef CONFIG_CPU_FREQ_STAT_DETAILS 87#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
68static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) 88static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
69{ 89{
@@ -113,10 +133,12 @@ cpufreq_freq_attr_ro(trans_table);
113 133
114cpufreq_freq_attr_ro(total_trans); 134cpufreq_freq_attr_ro(total_trans);
115cpufreq_freq_attr_ro(time_in_state); 135cpufreq_freq_attr_ro(time_in_state);
136cpufreq_freq_attr_wo(reset);
116 137
117static struct attribute *default_attrs[] = { 138static struct attribute *default_attrs[] = {
118 &total_trans.attr, 139 &total_trans.attr,
119 &time_in_state.attr, 140 &time_in_state.attr,
141 &reset.attr,
120#ifdef CONFIG_CPU_FREQ_STAT_DETAILS 142#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
121 &trans_table.attr, 143 &trans_table.attr,
122#endif 144#endif
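The new reset attribute is declared with cpufreq_freq_attr_wo(), a write-only counterpart of the existing _ro/_rw macros; its definition lives in include/linux/cpufreq.h (touched elsewhere in this series) and presumably follows the established pattern, roughly:

/* Presumed shape, mirroring cpufreq_freq_attr_ro()/_rw(); not verified here. */
#define cpufreq_freq_attr_wo(_name)		\
static struct freq_attr _name =			\
__ATTR(_name, 0200, NULL, store_##_name)

Writing any value to .../cpufreq/stats/reset then invokes store_reset(), which zeroes time_in_state, trans_table and total_trans and restarts the accounting from the current jiffies.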
diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c
deleted file mode 100644
index 79e3ff2771a6..000000000000
--- a/drivers/cpufreq/integrator-cpufreq.c
+++ /dev/null
@@ -1,239 +0,0 @@
1/*
2 * Copyright (C) 2001-2002 Deep Blue Solutions Ltd.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * CPU support functions
9 */
10#include <linux/module.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/cpufreq.h>
14#include <linux/sched.h>
15#include <linux/smp.h>
16#include <linux/init.h>
17#include <linux/io.h>
18#include <linux/platform_device.h>
19#include <linux/of.h>
20#include <linux/of_address.h>
21
22#include <asm/mach-types.h>
23#include <asm/hardware/icst.h>
24
25static void __iomem *cm_base;
26/* The cpufreq driver only use the OSC register */
27#define INTEGRATOR_HDR_OSC_OFFSET 0x08
28#define INTEGRATOR_HDR_LOCK_OFFSET 0x14
29
30static struct cpufreq_driver integrator_driver;
31
32static const struct icst_params lclk_params = {
33 .ref = 24000000,
34 .vco_max = ICST525_VCO_MAX_5V,
35 .vco_min = ICST525_VCO_MIN,
36 .vd_min = 8,
37 .vd_max = 132,
38 .rd_min = 24,
39 .rd_max = 24,
40 .s2div = icst525_s2div,
41 .idx2s = icst525_idx2s,
42};
43
44static const struct icst_params cclk_params = {
45 .ref = 24000000,
46 .vco_max = ICST525_VCO_MAX_5V,
47 .vco_min = ICST525_VCO_MIN,
48 .vd_min = 12,
49 .vd_max = 160,
50 .rd_min = 24,
51 .rd_max = 24,
52 .s2div = icst525_s2div,
53 .idx2s = icst525_idx2s,
54};
55
56/*
57 * Validate the speed policy.
58 */
59static int integrator_verify_policy(struct cpufreq_policy *policy)
60{
61 struct icst_vco vco;
62
63 cpufreq_verify_within_cpu_limits(policy);
64
65 vco = icst_hz_to_vco(&cclk_params, policy->max * 1000);
66 policy->max = icst_hz(&cclk_params, vco) / 1000;
67
68 vco = icst_hz_to_vco(&cclk_params, policy->min * 1000);
69 policy->min = icst_hz(&cclk_params, vco) / 1000;
70
71 cpufreq_verify_within_cpu_limits(policy);
72 return 0;
73}
74
75
76static int integrator_set_target(struct cpufreq_policy *policy,
77 unsigned int target_freq,
78 unsigned int relation)
79{
80 cpumask_t cpus_allowed;
81 int cpu = policy->cpu;
82 struct icst_vco vco;
83 struct cpufreq_freqs freqs;
84 u_int cm_osc;
85
86 /*
87 * Save this threads cpus_allowed mask.
88 */
89 cpus_allowed = current->cpus_allowed;
90
91 /*
92 * Bind to the specified CPU. When this call returns,
93 * we should be running on the right CPU.
94 */
95 set_cpus_allowed_ptr(current, cpumask_of(cpu));
96 BUG_ON(cpu != smp_processor_id());
97
98 /* get current setting */
99 cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
100
101 if (machine_is_integrator())
102 vco.s = (cm_osc >> 8) & 7;
103 else if (machine_is_cintegrator())
104 vco.s = 1;
105 vco.v = cm_osc & 255;
106 vco.r = 22;
107 freqs.old = icst_hz(&cclk_params, vco) / 1000;
108
109 /* icst_hz_to_vco rounds down -- so we need the next
110 * larger freq in case of CPUFREQ_RELATION_L.
111 */
112 if (relation == CPUFREQ_RELATION_L)
113 target_freq += 999;
114 if (target_freq > policy->max)
115 target_freq = policy->max;
116 vco = icst_hz_to_vco(&cclk_params, target_freq * 1000);
117 freqs.new = icst_hz(&cclk_params, vco) / 1000;
118
119 if (freqs.old == freqs.new) {
120 set_cpus_allowed_ptr(current, &cpus_allowed);
121 return 0;
122 }
123
124 cpufreq_freq_transition_begin(policy, &freqs);
125
126 cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
127
128 if (machine_is_integrator()) {
129 cm_osc &= 0xfffff800;
130 cm_osc |= vco.s << 8;
131 } else if (machine_is_cintegrator()) {
132 cm_osc &= 0xffffff00;
133 }
134 cm_osc |= vco.v;
135
136 __raw_writel(0xa05f, cm_base + INTEGRATOR_HDR_LOCK_OFFSET);
137 __raw_writel(cm_osc, cm_base + INTEGRATOR_HDR_OSC_OFFSET);
138 __raw_writel(0, cm_base + INTEGRATOR_HDR_LOCK_OFFSET);
139
140 /*
141 * Restore the CPUs allowed mask.
142 */
143 set_cpus_allowed_ptr(current, &cpus_allowed);
144
145 cpufreq_freq_transition_end(policy, &freqs, 0);
146
147 return 0;
148}
149
150static unsigned int integrator_get(unsigned int cpu)
151{
152 cpumask_t cpus_allowed;
153 unsigned int current_freq;
154 u_int cm_osc;
155 struct icst_vco vco;
156
157 cpus_allowed = current->cpus_allowed;
158
159 set_cpus_allowed_ptr(current, cpumask_of(cpu));
160 BUG_ON(cpu != smp_processor_id());
161
162 /* detect memory etc. */
163 cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
164
165 if (machine_is_integrator())
166 vco.s = (cm_osc >> 8) & 7;
167 else
168 vco.s = 1;
169 vco.v = cm_osc & 255;
170 vco.r = 22;
171
172 current_freq = icst_hz(&cclk_params, vco) / 1000; /* current freq */
173
174 set_cpus_allowed_ptr(current, &cpus_allowed);
175
176 return current_freq;
177}
178
179static int integrator_cpufreq_init(struct cpufreq_policy *policy)
180{
181
182 /* set default policy and cpuinfo */
183 policy->max = policy->cpuinfo.max_freq = 160000;
184 policy->min = policy->cpuinfo.min_freq = 12000;
185 policy->cpuinfo.transition_latency = 1000000; /* 1 ms, assumed */
186
187 return 0;
188}
189
190static struct cpufreq_driver integrator_driver = {
191 .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK,
192 .verify = integrator_verify_policy,
193 .target = integrator_set_target,
194 .get = integrator_get,
195 .init = integrator_cpufreq_init,
196 .name = "integrator",
197};
198
199static int __init integrator_cpufreq_probe(struct platform_device *pdev)
200{
201 struct resource *res;
202
203 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
204 if (!res)
205 return -ENODEV;
206
207 cm_base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
208 if (!cm_base)
209 return -ENODEV;
210
211 return cpufreq_register_driver(&integrator_driver);
212}
213
214static int __exit integrator_cpufreq_remove(struct platform_device *pdev)
215{
216 return cpufreq_unregister_driver(&integrator_driver);
217}
218
219static const struct of_device_id integrator_cpufreq_match[] = {
220 { .compatible = "arm,core-module-integrator"},
221 { },
222};
223
224MODULE_DEVICE_TABLE(of, integrator_cpufreq_match);
225
226static struct platform_driver integrator_cpufreq_driver = {
227 .driver = {
228 .name = "integrator-cpufreq",
229 .of_match_table = integrator_cpufreq_match,
230 },
231 .remove = __exit_p(integrator_cpufreq_remove),
232};
233
234module_platform_driver_probe(integrator_cpufreq_driver,
235 integrator_cpufreq_probe);
236
237MODULE_AUTHOR("Russell M. King");
238MODULE_DESCRIPTION("cpufreq driver for ARM Integrator CPUs");
239MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e8dc42fc0915..6acbd4af632e 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -37,6 +37,8 @@
37#include <asm/cpufeature.h> 37#include <asm/cpufeature.h>
38#include <asm/intel-family.h> 38#include <asm/intel-family.h>
39 39
40#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
41
40#define ATOM_RATIOS 0x66a 42#define ATOM_RATIOS 0x66a
41#define ATOM_VIDS 0x66b 43#define ATOM_VIDS 0x66b
42#define ATOM_TURBO_RATIOS 0x66c 44#define ATOM_TURBO_RATIOS 0x66c
@@ -53,6 +55,8 @@
53 55
54#define EXT_BITS 6 56#define EXT_BITS 6
55#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) 57#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
58#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
59#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
56 60
57static inline int32_t mul_fp(int32_t x, int32_t y) 61static inline int32_t mul_fp(int32_t x, int32_t y)
58{ 62{
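The EXT_* helpers add six extra fraction bits on top of the driver's existing fixed-point format, for the per-CPU performance-limit ratios. Assuming FRAC_BITS is 8, as elsewhere in this file, the extended format has 14 fraction bits; for example:

/* Illustration only; FRAC_BITS == 8 is assumed from the surrounding file.   */
/* EXT_FRAC_BITS = EXT_BITS + FRAC_BITS = 6 + 8 = 14                         */
/* int_ext_tofp(1)     == 1 << 14 == 16384  (1.0 in extended fixed point)    */
/* fp_ext_toint(16384) == 1                                                  */
/* div_ext_fp(75, 100), used later in this patch, stores 0.75 as 12288       */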
@@ -123,6 +127,8 @@ struct sample {
123 * @scaling: Scaling factor to convert frequency to cpufreq 127 * @scaling: Scaling factor to convert frequency to cpufreq
124 * frequency units 128 * frequency units
125 * @turbo_pstate: Max Turbo P state possible for this platform 129 * @turbo_pstate: Max Turbo P state possible for this platform
130 * @max_freq: @max_pstate frequency in cpufreq units
131 * @turbo_freq: @turbo_pstate frequency in cpufreq units
126 * 132 *
127 * Stores the per cpu model P state limits and current P state. 133 * Stores the per cpu model P state limits and current P state.
128 */ 134 */
@@ -133,6 +139,8 @@ struct pstate_data {
133 int max_pstate_physical; 139 int max_pstate_physical;
134 int scaling; 140 int scaling;
135 int turbo_pstate; 141 int turbo_pstate;
142 unsigned int max_freq;
143 unsigned int turbo_freq;
136}; 144};
137 145
138/** 146/**
@@ -178,6 +186,48 @@ struct _pid {
178}; 186};
179 187
180/** 188/**
189 * struct perf_limits - Store user and policy limits
190 * @no_turbo: User requested turbo state from intel_pstate sysfs
191 * @turbo_disabled: Platform turbo status either from msr
192 * MSR_IA32_MISC_ENABLE or when maximum available pstate
193 * matches the maximum turbo pstate
194 * @max_perf_pct: Effective maximum performance limit in percentage, this
195 * is minimum of either limits enforced by cpufreq policy
196 * or limits from user set limits via intel_pstate sysfs
197 * @min_perf_pct: Effective minimum performance limit in percentage, this
198 * is maximum of either limits enforced by cpufreq policy
199 * or limits from user set limits via intel_pstate sysfs
200 * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct
201 * This value is used to limit max pstate
202 * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct
203 * This value is used to limit min pstate
204 * @max_policy_pct: The maximum performance in percentage enforced by
205 * cpufreq setpolicy interface
206 * @max_sysfs_pct: The maximum performance in percentage enforced by
207 * intel pstate sysfs interface, unused when per cpu
208 * controls are enforced
209 * @min_policy_pct: The minimum performance in percentage enforced by
210 * cpufreq setpolicy interface
211 * @min_sysfs_pct: The minimum performance in percentage enforced by
212 * intel pstate sysfs interface, unused when per cpu
213 * controls are enforced
214 *
215 * Storage for user and policy defined limits.
216 */
217struct perf_limits {
218 int no_turbo;
219 int turbo_disabled;
220 int max_perf_pct;
221 int min_perf_pct;
222 int32_t max_perf;
223 int32_t min_perf;
224 int max_policy_pct;
225 int max_sysfs_pct;
226 int min_policy_pct;
227 int min_sysfs_pct;
228};
229
230/**
181 * struct cpudata - Per CPU instance data storage 231 * struct cpudata - Per CPU instance data storage
182 * @cpu: CPU number for this instance data 232 * @cpu: CPU number for this instance data
183 * @policy: CPUFreq policy value 233 * @policy: CPUFreq policy value
@@ -195,8 +245,19 @@ struct _pid {
195 * @prev_cummulative_iowait: IO Wait time difference from last and 245 * @prev_cummulative_iowait: IO Wait time difference from last and
196 * current sample 246 * current sample
197 * @sample: Storage for storing last Sample data 247 * @sample: Storage for storing last Sample data
248 * @perf_limits: Pointer to perf_limit unique to this CPU
 249 * Not all fields in the structure are applicable
250 * when per cpu controls are enforced
198 * @acpi_perf_data: Stores ACPI perf information read from _PSS 251 * @acpi_perf_data: Stores ACPI perf information read from _PSS
199 * @valid_pss_table: Set to true for valid ACPI _PSS entries found 252 * @valid_pss_table: Set to true for valid ACPI _PSS entries found
253 * @epp_powersave: Last saved HWP energy performance preference
254 * (EPP) or energy performance bias (EPB),
255 * when policy switched to performance
256 * @epp_policy: Last saved policy used to set EPP/EPB
257 * @epp_default: Power on default HWP energy performance
258 * preference/bias
259 * @epp_saved: Saved EPP/EPB during system suspend or CPU offline
260 * operation
200 * 261 *
201 * This structure stores per CPU instance data for all CPUs. 262 * This structure stores per CPU instance data for all CPUs.
202 */ 263 */
@@ -218,11 +279,16 @@ struct cpudata {
218 u64 prev_tsc; 279 u64 prev_tsc;
219 u64 prev_cummulative_iowait; 280 u64 prev_cummulative_iowait;
220 struct sample sample; 281 struct sample sample;
282 struct perf_limits *perf_limits;
221#ifdef CONFIG_ACPI 283#ifdef CONFIG_ACPI
222 struct acpi_processor_performance acpi_perf_data; 284 struct acpi_processor_performance acpi_perf_data;
223 bool valid_pss_table; 285 bool valid_pss_table;
224#endif 286#endif
225 unsigned int iowait_boost; 287 unsigned int iowait_boost;
288 s16 epp_powersave;
289 s16 epp_policy;
290 s16 epp_default;
291 s16 epp_saved;
226}; 292};
227 293
228static struct cpudata **all_cpu_data; 294static struct cpudata **all_cpu_data;
@@ -236,7 +302,6 @@ static struct cpudata **all_cpu_data;
236 * @p_gain_pct: PID proportional gain 302 * @p_gain_pct: PID proportional gain
237 * @i_gain_pct: PID integral gain 303 * @i_gain_pct: PID integral gain
238 * @d_gain_pct: PID derivative gain 304 * @d_gain_pct: PID derivative gain
239 * @boost_iowait: Whether or not to use iowait boosting.
240 * 305 *
241 * Stores per CPU model static PID configuration data. 306 * Stores per CPU model static PID configuration data.
242 */ 307 */
@@ -248,7 +313,6 @@ struct pstate_adjust_policy {
248 int p_gain_pct; 313 int p_gain_pct;
249 int d_gain_pct; 314 int d_gain_pct;
250 int i_gain_pct; 315 int i_gain_pct;
251 bool boost_iowait;
252}; 316};
253 317
254/** 318/**
@@ -292,58 +356,19 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
292static struct pstate_adjust_policy pid_params __read_mostly; 356static struct pstate_adjust_policy pid_params __read_mostly;
293static struct pstate_funcs pstate_funcs __read_mostly; 357static struct pstate_funcs pstate_funcs __read_mostly;
294static int hwp_active __read_mostly; 358static int hwp_active __read_mostly;
359static bool per_cpu_limits __read_mostly;
295 360
296#ifdef CONFIG_ACPI 361#ifdef CONFIG_ACPI
297static bool acpi_ppc; 362static bool acpi_ppc;
298#endif 363#endif
299 364
300/**
301 * struct perf_limits - Store user and policy limits
302 * @no_turbo: User requested turbo state from intel_pstate sysfs
303 * @turbo_disabled: Platform turbo status either from msr
304 * MSR_IA32_MISC_ENABLE or when maximum available pstate
305 * matches the maximum turbo pstate
306 * @max_perf_pct: Effective maximum performance limit in percentage, this
307 * is minimum of either limits enforced by cpufreq policy
308 * or limits from user set limits via intel_pstate sysfs
309 * @min_perf_pct: Effective minimum performance limit in percentage, this
310 * is maximum of either limits enforced by cpufreq policy
311 * or limits from user set limits via intel_pstate sysfs
312 * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct
313 * This value is used to limit max pstate
314 * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct
315 * This value is used to limit min pstate
316 * @max_policy_pct: The maximum performance in percentage enforced by
317 * cpufreq setpolicy interface
318 * @max_sysfs_pct: The maximum performance in percentage enforced by
319 * intel pstate sysfs interface
320 * @min_policy_pct: The minimum performance in percentage enforced by
321 * cpufreq setpolicy interface
322 * @min_sysfs_pct: The minimum performance in percentage enforced by
323 * intel pstate sysfs interface
324 *
325 * Storage for user and policy defined limits.
326 */
327struct perf_limits {
328 int no_turbo;
329 int turbo_disabled;
330 int max_perf_pct;
331 int min_perf_pct;
332 int32_t max_perf;
333 int32_t min_perf;
334 int max_policy_pct;
335 int max_sysfs_pct;
336 int min_policy_pct;
337 int min_sysfs_pct;
338};
339
340static struct perf_limits performance_limits = { 365static struct perf_limits performance_limits = {
341 .no_turbo = 0, 366 .no_turbo = 0,
342 .turbo_disabled = 0, 367 .turbo_disabled = 0,
343 .max_perf_pct = 100, 368 .max_perf_pct = 100,
344 .max_perf = int_tofp(1), 369 .max_perf = int_ext_tofp(1),
345 .min_perf_pct = 100, 370 .min_perf_pct = 100,
346 .min_perf = int_tofp(1), 371 .min_perf = int_ext_tofp(1),
347 .max_policy_pct = 100, 372 .max_policy_pct = 100,
348 .max_sysfs_pct = 100, 373 .max_sysfs_pct = 100,
349 .min_policy_pct = 0, 374 .min_policy_pct = 0,
@@ -354,7 +379,7 @@ static struct perf_limits powersave_limits = {
354 .no_turbo = 0, 379 .no_turbo = 0,
355 .turbo_disabled = 0, 380 .turbo_disabled = 0,
356 .max_perf_pct = 100, 381 .max_perf_pct = 100,
357 .max_perf = int_tofp(1), 382 .max_perf = int_ext_tofp(1),
358 .min_perf_pct = 0, 383 .min_perf_pct = 0,
359 .min_perf = 0, 384 .min_perf = 0,
360 .max_policy_pct = 100, 385 .max_policy_pct = 100,
@@ -369,6 +394,8 @@ static struct perf_limits *limits = &performance_limits;
369static struct perf_limits *limits = &powersave_limits; 394static struct perf_limits *limits = &powersave_limits;
370#endif 395#endif
371 396
397static DEFINE_MUTEX(intel_pstate_limits_lock);
398
372#ifdef CONFIG_ACPI 399#ifdef CONFIG_ACPI
373 400
374static bool intel_pstate_get_ppc_enable_status(void) 401static bool intel_pstate_get_ppc_enable_status(void)
@@ -513,11 +540,11 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
513} 540}
514 541
515#else 542#else
516static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) 543static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
517{ 544{
518} 545}
519 546
520static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) 547static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
521{ 548{
522} 549}
523#endif 550#endif
@@ -613,24 +640,252 @@ static inline void update_turbo_state(void)
613 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); 640 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
614} 641}
615 642
643static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
644{
645 u64 epb;
646 int ret;
647
648 if (!static_cpu_has(X86_FEATURE_EPB))
649 return -ENXIO;
650
651 ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
652 if (ret)
653 return (s16)ret;
654
655 return (s16)(epb & 0x0f);
656}
657
658static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
659{
660 s16 epp;
661
662 if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
663 /*
 664 * When hwp_req_data is 0, it means that the caller did not read
 665 * MSR_HWP_REQUEST, so read it here to obtain the EPP.
666 */
667 if (!hwp_req_data) {
668 epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
669 &hwp_req_data);
670 if (epp)
671 return epp;
672 }
673 epp = (hwp_req_data >> 24) & 0xff;
674 } else {
675 /* When there is no EPP present, HWP uses EPB settings */
676 epp = intel_pstate_get_epb(cpu_data);
677 }
678
679 return epp;
680}
681
682static int intel_pstate_set_epb(int cpu, s16 pref)
683{
684 u64 epb;
685 int ret;
686
687 if (!static_cpu_has(X86_FEATURE_EPB))
688 return -ENXIO;
689
690 ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
691 if (ret)
692 return ret;
693
694 epb = (epb & ~0x0f) | pref;
695 wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
696
697 return 0;
698}
699
700/*
701 * EPP/EPB display strings corresponding to EPP index in the
702 * energy_perf_strings[]
703 * index String
704 *-------------------------------------
705 * 0 default
706 * 1 performance
707 * 2 balance_performance
708 * 3 balance_power
709 * 4 power
710 */
711static const char * const energy_perf_strings[] = {
712 "default",
713 "performance",
714 "balance_performance",
715 "balance_power",
716 "power",
717 NULL
718};
719
720static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data)
721{
722 s16 epp;
723 int index = -EINVAL;
724
725 epp = intel_pstate_get_epp(cpu_data, 0);
726 if (epp < 0)
727 return epp;
728
729 if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
730 /*
731 * Range:
732 * 0x00-0x3F : Performance
733 * 0x40-0x7F : Balance performance
734 * 0x80-0xBF : Balance power
735 * 0xC0-0xFF : Power
736 * The EPP is a 8 bit value, but our ranges restrict the
737 * value which can be set. Here only using top two bits
738 * effectively.
739 */
740 index = (epp >> 6) + 1;
741 } else if (static_cpu_has(X86_FEATURE_EPB)) {
742 /*
743 * Range:
744 * 0x00-0x03 : Performance
745 * 0x04-0x07 : Balance performance
746 * 0x08-0x0B : Balance power
747 * 0x0C-0x0F : Power
748 * The EPB is a 4 bit value, but our ranges restrict the
749 * value which can be set. Here only using top two bits
750 * effectively.
751 */
752 index = (epp >> 2) + 1;
753 }
754
755 return index;
756}
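/*
 * Worked example for the index mapping above (illustrative, not part of the
 * patch): only the top two bits select the bucket, so an 8-bit HWP EPP of
 * 0x80 maps to (0x80 >> 6) + 1 = 3, "balance_power", while a 4-bit EPB of
 * 0x06 maps to (0x06 >> 2) + 1 = 2, "balance_performance". Index 0 is
 * reserved for "default".
 */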
757
758static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
759 int pref_index)
760{
761 int epp = -EINVAL;
762 int ret;
763
764 if (!pref_index)
765 epp = cpu_data->epp_default;
766
767 mutex_lock(&intel_pstate_limits_lock);
768
769 if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
770 u64 value;
771
772 ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value);
773 if (ret)
774 goto return_pref;
775
776 value &= ~GENMASK_ULL(31, 24);
777
778 /*
779 * If epp is not default, convert from index into
780 * energy_perf_strings to epp value, by shifting 6
781 * bits left to use only top two bits in epp.
 782 * The resulting epp then needs to be shifted by 24 bits to the
 783 * EPP position in MSR_HWP_REQUEST.
784 */
785 if (epp == -EINVAL)
786 epp = (pref_index - 1) << 6;
787
788 value |= (u64)epp << 24;
789 ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value);
790 } else {
791 if (epp == -EINVAL)
792 epp = (pref_index - 1) << 2;
793 ret = intel_pstate_set_epb(cpu_data->cpu, epp);
794 }
795return_pref:
796 mutex_unlock(&intel_pstate_limits_lock);
797
798 return ret;
799}
800
801static ssize_t show_energy_performance_available_preferences(
802 struct cpufreq_policy *policy, char *buf)
803{
804 int i = 0;
805 int ret = 0;
806
807 while (energy_perf_strings[i] != NULL)
808 ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
809
810 ret += sprintf(&buf[ret], "\n");
811
812 return ret;
813}
814
815cpufreq_freq_attr_ro(energy_performance_available_preferences);
816
817static ssize_t store_energy_performance_preference(
818 struct cpufreq_policy *policy, const char *buf, size_t count)
819{
820 struct cpudata *cpu_data = all_cpu_data[policy->cpu];
821 char str_preference[21];
822 int ret, i = 0;
823
824 ret = sscanf(buf, "%20s", str_preference);
825 if (ret != 1)
826 return -EINVAL;
827
828 while (energy_perf_strings[i] != NULL) {
829 if (!strcmp(str_preference, energy_perf_strings[i])) {
830 intel_pstate_set_energy_pref_index(cpu_data, i);
831 return count;
832 }
833 ++i;
834 }
835
836 return -EINVAL;
837}
838
839static ssize_t show_energy_performance_preference(
840 struct cpufreq_policy *policy, char *buf)
841{
842 struct cpudata *cpu_data = all_cpu_data[policy->cpu];
843 int preference;
844
845 preference = intel_pstate_get_energy_pref_index(cpu_data);
846 if (preference < 0)
847 return preference;
848
849 return sprintf(buf, "%s\n", energy_perf_strings[preference]);
850}
851
852cpufreq_freq_attr_rw(energy_performance_preference);
853
854static struct freq_attr *hwp_cpufreq_attrs[] = {
855 &energy_performance_preference,
856 &energy_performance_available_preferences,
857 NULL,
858};
859
616static void intel_pstate_hwp_set(const struct cpumask *cpumask) 860static void intel_pstate_hwp_set(const struct cpumask *cpumask)
617{ 861{
618 int min, hw_min, max, hw_max, cpu, range, adj_range; 862 int min, hw_min, max, hw_max, cpu, range, adj_range;
863 struct perf_limits *perf_limits = limits;
619 u64 value, cap; 864 u64 value, cap;
620 865
621 for_each_cpu(cpu, cpumask) { 866 for_each_cpu(cpu, cpumask) {
867 int max_perf_pct, min_perf_pct;
868 struct cpudata *cpu_data = all_cpu_data[cpu];
869 s16 epp;
870
871 if (per_cpu_limits)
872 perf_limits = all_cpu_data[cpu]->perf_limits;
873
622 rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); 874 rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
623 hw_min = HWP_LOWEST_PERF(cap); 875 hw_min = HWP_LOWEST_PERF(cap);
624 hw_max = HWP_HIGHEST_PERF(cap); 876 hw_max = HWP_HIGHEST_PERF(cap);
625 range = hw_max - hw_min; 877 range = hw_max - hw_min;
626 878
879 max_perf_pct = perf_limits->max_perf_pct;
880 min_perf_pct = perf_limits->min_perf_pct;
881
627 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); 882 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
628 adj_range = limits->min_perf_pct * range / 100; 883 adj_range = min_perf_pct * range / 100;
629 min = hw_min + adj_range; 884 min = hw_min + adj_range;
630 value &= ~HWP_MIN_PERF(~0L); 885 value &= ~HWP_MIN_PERF(~0L);
631 value |= HWP_MIN_PERF(min); 886 value |= HWP_MIN_PERF(min);
632 887
633 adj_range = limits->max_perf_pct * range / 100; 888 adj_range = max_perf_pct * range / 100;
634 max = hw_min + adj_range; 889 max = hw_min + adj_range;
635 if (limits->no_turbo) { 890 if (limits->no_turbo) {
636 hw_max = HWP_GUARANTEED_PERF(cap); 891 hw_max = HWP_GUARANTEED_PERF(cap);
@@ -640,6 +895,53 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
640 895
641 value &= ~HWP_MAX_PERF(~0L); 896 value &= ~HWP_MAX_PERF(~0L);
642 value |= HWP_MAX_PERF(max); 897 value |= HWP_MAX_PERF(max);
898
899 if (cpu_data->epp_policy == cpu_data->policy)
900 goto skip_epp;
901
902 cpu_data->epp_policy = cpu_data->policy;
903
904 if (cpu_data->epp_saved >= 0) {
905 epp = cpu_data->epp_saved;
906 cpu_data->epp_saved = -EINVAL;
907 goto update_epp;
908 }
909
910 if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
911 epp = intel_pstate_get_epp(cpu_data, value);
912 cpu_data->epp_powersave = epp;
 913 /* If the EPP read failed, don't try to write */
914 if (epp < 0)
915 goto skip_epp;
916
917
918 epp = 0;
919 } else {
920 /* skip setting EPP, when saved value is invalid */
921 if (cpu_data->epp_powersave < 0)
922 goto skip_epp;
923
924 /*
925 * No need to restore EPP when it is not zero. This
926 * means:
927 * - Policy is not changed
 928 * - The user has changed it manually
929 * - Error reading EPB
930 */
931 epp = intel_pstate_get_epp(cpu_data, value);
932 if (epp)
933 goto skip_epp;
934
935 epp = cpu_data->epp_powersave;
936 }
937update_epp:
938 if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
939 value &= ~GENMASK_ULL(31, 24);
940 value |= (u64)epp << 24;
941 } else {
942 intel_pstate_set_epb(cpu, epp);
943 }
944skip_epp:
643 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 945 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
644 } 946 }
645} 947}
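/*
 * Recap of the EPP/EPB handling added to intel_pstate_hwp_set() above
 * (illustrative summary, not part of the patch):
 *  - policy unchanged since the last call    -> leave EPP alone
 *  - epp_saved valid (resume / CPU online)   -> restore the saved value
 *  - switching to the performance policy     -> remember the current EPP,
 *                                               then program 0 (maximum
 *                                               performance preference)
 *  - leaving the performance policy          -> restore the remembered EPP,
 *                                               but only if it still reads 0
 */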
@@ -652,6 +954,28 @@ static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy)
652 return 0; 954 return 0;
653} 955}
654 956
957static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
958{
959 struct cpudata *cpu_data = all_cpu_data[policy->cpu];
960
961 if (!hwp_active)
962 return 0;
963
964 cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0);
965
966 return 0;
967}
968
969static int intel_pstate_resume(struct cpufreq_policy *policy)
970{
971 if (!hwp_active)
972 return 0;
973
974 all_cpu_data[policy->cpu]->epp_policy = 0;
975
976 return intel_pstate_hwp_set_policy(policy);
977}
978
655static void intel_pstate_hwp_set_online_cpus(void) 979static void intel_pstate_hwp_set_online_cpus(void)
656{ 980{
657 get_online_cpus(); 981 get_online_cpus();
@@ -694,8 +1018,10 @@ static void __init intel_pstate_debug_expose_params(void)
694 struct dentry *debugfs_parent; 1018 struct dentry *debugfs_parent;
695 int i = 0; 1019 int i = 0;
696 1020
697 if (hwp_active) 1021 if (hwp_active ||
1022 pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load)
698 return; 1023 return;
1024
699 debugfs_parent = debugfs_create_dir("pstate_snb", NULL); 1025 debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
700 if (IS_ERR_OR_NULL(debugfs_parent)) 1026 if (IS_ERR_OR_NULL(debugfs_parent))
701 return; 1027 return;
@@ -768,9 +1094,12 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
768 if (ret != 1) 1094 if (ret != 1)
769 return -EINVAL; 1095 return -EINVAL;
770 1096
1097 mutex_lock(&intel_pstate_limits_lock);
1098
771 update_turbo_state(); 1099 update_turbo_state();
772 if (limits->turbo_disabled) { 1100 if (limits->turbo_disabled) {
773 pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); 1101 pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
1102 mutex_unlock(&intel_pstate_limits_lock);
774 return -EPERM; 1103 return -EPERM;
775 } 1104 }
776 1105
@@ -779,6 +1108,8 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
779 if (hwp_active) 1108 if (hwp_active)
780 intel_pstate_hwp_set_online_cpus(); 1109 intel_pstate_hwp_set_online_cpus();
781 1110
1111 mutex_unlock(&intel_pstate_limits_lock);
1112
782 return count; 1113 return count;
783} 1114}
784 1115
@@ -792,6 +1123,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
792 if (ret != 1) 1123 if (ret != 1)
793 return -EINVAL; 1124 return -EINVAL;
794 1125
1126 mutex_lock(&intel_pstate_limits_lock);
1127
795 limits->max_sysfs_pct = clamp_t(int, input, 0 , 100); 1128 limits->max_sysfs_pct = clamp_t(int, input, 0 , 100);
796 limits->max_perf_pct = min(limits->max_policy_pct, 1129 limits->max_perf_pct = min(limits->max_policy_pct,
797 limits->max_sysfs_pct); 1130 limits->max_sysfs_pct);
@@ -799,10 +1132,13 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
799 limits->max_perf_pct); 1132 limits->max_perf_pct);
800 limits->max_perf_pct = max(limits->min_perf_pct, 1133 limits->max_perf_pct = max(limits->min_perf_pct,
801 limits->max_perf_pct); 1134 limits->max_perf_pct);
802 limits->max_perf = div_fp(limits->max_perf_pct, 100); 1135 limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
803 1136
804 if (hwp_active) 1137 if (hwp_active)
805 intel_pstate_hwp_set_online_cpus(); 1138 intel_pstate_hwp_set_online_cpus();
1139
1140 mutex_unlock(&intel_pstate_limits_lock);
1141
806 return count; 1142 return count;
807} 1143}
808 1144
@@ -816,6 +1152,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
816 if (ret != 1) 1152 if (ret != 1)
817 return -EINVAL; 1153 return -EINVAL;
818 1154
1155 mutex_lock(&intel_pstate_limits_lock);
1156
819 limits->min_sysfs_pct = clamp_t(int, input, 0 , 100); 1157 limits->min_sysfs_pct = clamp_t(int, input, 0 , 100);
820 limits->min_perf_pct = max(limits->min_policy_pct, 1158 limits->min_perf_pct = max(limits->min_policy_pct,
821 limits->min_sysfs_pct); 1159 limits->min_sysfs_pct);
@@ -823,10 +1161,13 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
823 limits->min_perf_pct); 1161 limits->min_perf_pct);
824 limits->min_perf_pct = min(limits->max_perf_pct, 1162 limits->min_perf_pct = min(limits->max_perf_pct,
825 limits->min_perf_pct); 1163 limits->min_perf_pct);
826 limits->min_perf = div_fp(limits->min_perf_pct, 100); 1164 limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
827 1165
828 if (hwp_active) 1166 if (hwp_active)
829 intel_pstate_hwp_set_online_cpus(); 1167 intel_pstate_hwp_set_online_cpus();
1168
1169 mutex_unlock(&intel_pstate_limits_lock);
1170
830 return count; 1171 return count;
831} 1172}
832 1173
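
The three store handlers above gain intel_pstate_limits_lock around their updates to the shared limits, including the early -EPERM return in store_no_turbo. A small userspace sketch of that locking pattern, using pthread as a stand-in for the kernel mutex and made-up field names:

/*
 * Sketch only: every writer takes one lock before touching the shared
 * limits, and the error path unlocks before bailing out, mirroring the
 * intel_pstate_limits_lock usage added in the patch.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t limits_lock = PTHREAD_MUTEX_INITIALIZER;
static struct { int no_turbo, turbo_disabled; } limits;

static int store_no_turbo(int input)
{
	pthread_mutex_lock(&limits_lock);

	if (limits.turbo_disabled) {
		pthread_mutex_unlock(&limits_lock);	/* unlock before the error return */
		return -1;				/* stands in for -EPERM */
	}
	limits.no_turbo = !!input;

	pthread_mutex_unlock(&limits_lock);
	return 0;
}

int main(void)
{
	printf("store_no_turbo(1) -> %d, no_turbo=%d\n",
	       store_no_turbo(1), limits.no_turbo);
	return 0;
}
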
@@ -841,8 +1182,6 @@ define_one_global_ro(num_pstates);
841 1182
842static struct attribute *intel_pstate_attributes[] = { 1183static struct attribute *intel_pstate_attributes[] = {
843 &no_turbo.attr, 1184 &no_turbo.attr,
844 &max_perf_pct.attr,
845 &min_perf_pct.attr,
846 &turbo_pct.attr, 1185 &turbo_pct.attr,
847 &num_pstates.attr, 1186 &num_pstates.attr,
848 NULL 1187 NULL
@@ -859,9 +1198,26 @@ static void __init intel_pstate_sysfs_expose_params(void)
859 1198
860 intel_pstate_kobject = kobject_create_and_add("intel_pstate", 1199 intel_pstate_kobject = kobject_create_and_add("intel_pstate",
861 &cpu_subsys.dev_root->kobj); 1200 &cpu_subsys.dev_root->kobj);
862 BUG_ON(!intel_pstate_kobject); 1201 if (WARN_ON(!intel_pstate_kobject))
1202 return;
1203
863 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); 1204 rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
864 BUG_ON(rc); 1205 if (WARN_ON(rc))
1206 return;
1207
1208 /*
 1209	 * If per-CPU limits are enforced there are no global limits, so
 1210	 * return without creating the max/min_perf_pct attributes.
1211 */
1212 if (per_cpu_limits)
1213 return;
1214
1215 rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
1216 WARN_ON(rc);
1217
1218 rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
1219 WARN_ON(rc);
1220
865} 1221}
866/************************** sysfs end ************************/ 1222/************************** sysfs end ************************/
867 1223
@@ -872,6 +1228,9 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
872 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); 1228 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
873 1229
874 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 1230 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
1231 cpudata->epp_policy = 0;
1232 if (cpudata->epp_default == -EINVAL)
1233 cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
875} 1234}
876 1235
877static int atom_get_min_pstate(void) 1236static int atom_get_min_pstate(void)
@@ -1099,7 +1458,6 @@ static const struct cpu_defaults silvermont_params = {
1099 .p_gain_pct = 14, 1458 .p_gain_pct = 14,
1100 .d_gain_pct = 0, 1459 .d_gain_pct = 0,
1101 .i_gain_pct = 4, 1460 .i_gain_pct = 4,
1102 .boost_iowait = true,
1103 }, 1461 },
1104 .funcs = { 1462 .funcs = {
1105 .get_max = atom_get_max_pstate, 1463 .get_max = atom_get_max_pstate,
@@ -1121,7 +1479,6 @@ static const struct cpu_defaults airmont_params = {
1121 .p_gain_pct = 14, 1479 .p_gain_pct = 14,
1122 .d_gain_pct = 0, 1480 .d_gain_pct = 0,
1123 .i_gain_pct = 4, 1481 .i_gain_pct = 4,
1124 .boost_iowait = true,
1125 }, 1482 },
1126 .funcs = { 1483 .funcs = {
1127 .get_max = atom_get_max_pstate, 1484 .get_max = atom_get_max_pstate,
@@ -1163,7 +1520,6 @@ static const struct cpu_defaults bxt_params = {
1163 .p_gain_pct = 14, 1520 .p_gain_pct = 14,
1164 .d_gain_pct = 0, 1521 .d_gain_pct = 0,
1165 .i_gain_pct = 4, 1522 .i_gain_pct = 4,
1166 .boost_iowait = true,
1167 }, 1523 },
1168 .funcs = { 1524 .funcs = {
1169 .get_max = core_get_max_pstate, 1525 .get_max = core_get_max_pstate,
@@ -1181,20 +1537,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
1181 int max_perf = cpu->pstate.turbo_pstate; 1537 int max_perf = cpu->pstate.turbo_pstate;
1182 int max_perf_adj; 1538 int max_perf_adj;
1183 int min_perf; 1539 int min_perf;
1540 struct perf_limits *perf_limits = limits;
1184 1541
1185 if (limits->no_turbo || limits->turbo_disabled) 1542 if (limits->no_turbo || limits->turbo_disabled)
1186 max_perf = cpu->pstate.max_pstate; 1543 max_perf = cpu->pstate.max_pstate;
1187 1544
1545 if (per_cpu_limits)
1546 perf_limits = cpu->perf_limits;
1547
1188 /* 1548 /*
1189 * performance can be limited by user through sysfs, by cpufreq 1549 * performance can be limited by user through sysfs, by cpufreq
1190 * policy, or by cpu specific default values determined through 1550 * policy, or by cpu specific default values determined through
1191 * experimentation. 1551 * experimentation.
1192 */ 1552 */
1193 max_perf_adj = fp_toint(max_perf * limits->max_perf); 1553 max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf);
1194 *max = clamp_t(int, max_perf_adj, 1554 *max = clamp_t(int, max_perf_adj,
1195 cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); 1555 cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
1196 1556
1197 min_perf = fp_toint(max_perf * limits->min_perf); 1557 min_perf = fp_ext_toint(max_perf * perf_limits->min_perf);
1198 *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); 1558 *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
1199} 1559}
1200 1560
@@ -1232,6 +1592,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
1232 cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); 1592 cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
1233 cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); 1593 cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
1234 cpu->pstate.scaling = pstate_funcs.get_scaling(); 1594 cpu->pstate.scaling = pstate_funcs.get_scaling();
1595 cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
1596 cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
1235 1597
1236 if (pstate_funcs.get_vid) 1598 if (pstate_funcs.get_vid)
1237 pstate_funcs.get_vid(cpu); 1599 pstate_funcs.get_vid(cpu);
@@ -1370,15 +1732,19 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
1370 return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled); 1732 return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
1371} 1733}
1372 1734
1373static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) 1735static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
1374{ 1736{
1375 int max_perf, min_perf; 1737 int max_perf, min_perf;
1376 1738
1377 update_turbo_state();
1378
1379 intel_pstate_get_min_max(cpu, &min_perf, &max_perf); 1739 intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
1380 pstate = clamp_t(int, pstate, min_perf, max_perf); 1740 pstate = clamp_t(int, pstate, min_perf, max_perf);
1381 trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); 1741 trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
1742 return pstate;
1743}
1744
1745static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
1746{
1747 pstate = intel_pstate_prepare_request(cpu, pstate);
1382 if (pstate == cpu->pstate.current_pstate) 1748 if (pstate == cpu->pstate.current_pstate)
1383 return; 1749 return;
1384 1750
@@ -1396,6 +1762,8 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
1396 target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ? 1762 target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ?
1397 cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu); 1763 cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu);
1398 1764
1765 update_turbo_state();
1766
1399 intel_pstate_update_pstate(cpu, target_pstate); 1767 intel_pstate_update_pstate(cpu, target_pstate);
1400 1768
1401 sample = &cpu->sample; 1769 sample = &cpu->sample;
@@ -1416,7 +1784,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
1416 struct cpudata *cpu = container_of(data, struct cpudata, update_util); 1784 struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1417 u64 delta_ns; 1785 u64 delta_ns;
1418 1786
1419 if (pid_params.boost_iowait) { 1787 if (pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) {
1420 if (flags & SCHED_CPUFREQ_IOWAIT) { 1788 if (flags & SCHED_CPUFREQ_IOWAIT) {
1421 cpu->iowait_boost = int_tofp(1); 1789 cpu->iowait_boost = int_tofp(1);
1422 } else if (cpu->iowait_boost) { 1790 } else if (cpu->iowait_boost) {
@@ -1462,6 +1830,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
1462 ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_params), 1830 ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_params),
1463 ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params), 1831 ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params),
1464 ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_params), 1832 ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_params),
1833 ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_params),
1465 ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_params), 1834 ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_params),
1466 {} 1835 {}
1467}; 1836};
@@ -1478,11 +1847,26 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
1478{ 1847{
1479 struct cpudata *cpu; 1848 struct cpudata *cpu;
1480 1849
1481 if (!all_cpu_data[cpunum]) 1850 cpu = all_cpu_data[cpunum];
1482 all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), 1851
1483 GFP_KERNEL); 1852 if (!cpu) {
1484 if (!all_cpu_data[cpunum]) 1853 unsigned int size = sizeof(struct cpudata);
1485 return -ENOMEM; 1854
1855 if (per_cpu_limits)
1856 size += sizeof(struct perf_limits);
1857
1858 cpu = kzalloc(size, GFP_KERNEL);
1859 if (!cpu)
1860 return -ENOMEM;
1861
1862 all_cpu_data[cpunum] = cpu;
1863 if (per_cpu_limits)
1864 cpu->perf_limits = (struct perf_limits *)(cpu + 1);
1865
1866 cpu->epp_default = -EINVAL;
1867 cpu->epp_powersave = -EINVAL;
1868 cpu->epp_saved = -EINVAL;
1869 }
1486 1870
1487 cpu = all_cpu_data[cpunum]; 1871 cpu = all_cpu_data[cpunum];
1488 1872
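
The per-CPU limits path above carves struct perf_limits out of the same kzalloc() block as struct cpudata ("cpu + 1"). A minimal userspace sketch of that single-allocation pattern, with simplified stand-in structures and calloc() in place of kzalloc():

#include <stdio.h>
#include <stdlib.h>

struct perf_limits { int min_perf_pct, max_perf_pct; };
struct cpudata     { int cpu; struct perf_limits *perf_limits; };

static struct cpudata *cpudata_alloc(int cpu, int per_cpu_limits)
{
	size_t size = sizeof(struct cpudata);
	struct cpudata *c;

	if (per_cpu_limits)
		size += sizeof(struct perf_limits);

	c = calloc(1, size);		/* one zeroed allocation for both structs */
	if (!c)
		return NULL;

	c->cpu = cpu;
	if (per_cpu_limits)
		c->perf_limits = (struct perf_limits *)(c + 1);
	return c;
}

int main(void)
{
	struct cpudata *c = cpudata_alloc(0, 1);

	c->perf_limits->max_perf_pct = 100;
	printf("cpu %d max %d%%\n", c->cpu, c->perf_limits->max_perf_pct);
	free(c);			/* a single free() releases both structs */
	return 0;
}
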
@@ -1541,18 +1925,57 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
1541 limits->no_turbo = 0; 1925 limits->no_turbo = 0;
1542 limits->turbo_disabled = 0; 1926 limits->turbo_disabled = 0;
1543 limits->max_perf_pct = 100; 1927 limits->max_perf_pct = 100;
1544 limits->max_perf = int_tofp(1); 1928 limits->max_perf = int_ext_tofp(1);
1545 limits->min_perf_pct = 100; 1929 limits->min_perf_pct = 100;
1546 limits->min_perf = int_tofp(1); 1930 limits->min_perf = int_ext_tofp(1);
1547 limits->max_policy_pct = 100; 1931 limits->max_policy_pct = 100;
1548 limits->max_sysfs_pct = 100; 1932 limits->max_sysfs_pct = 100;
1549 limits->min_policy_pct = 0; 1933 limits->min_policy_pct = 0;
1550 limits->min_sysfs_pct = 0; 1934 limits->min_sysfs_pct = 0;
1551} 1935}
1552 1936
1937static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
1938 struct perf_limits *limits)
1939{
1940
1941 limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
1942 policy->cpuinfo.max_freq);
1943 limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
1944 if (policy->max == policy->min) {
1945 limits->min_policy_pct = limits->max_policy_pct;
1946 } else {
1947 limits->min_policy_pct = DIV_ROUND_UP(policy->min * 100,
1948 policy->cpuinfo.max_freq);
1949 limits->min_policy_pct = clamp_t(int, limits->min_policy_pct,
1950 0, 100);
1951 }
1952
1953 /* Normalize user input to [min_policy_pct, max_policy_pct] */
1954 limits->min_perf_pct = max(limits->min_policy_pct,
1955 limits->min_sysfs_pct);
1956 limits->min_perf_pct = min(limits->max_policy_pct,
1957 limits->min_perf_pct);
1958 limits->max_perf_pct = min(limits->max_policy_pct,
1959 limits->max_sysfs_pct);
1960 limits->max_perf_pct = max(limits->min_policy_pct,
1961 limits->max_perf_pct);
1962
1963 /* Make sure min_perf_pct <= max_perf_pct */
1964 limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
1965
1966 limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
1967 limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
1968 limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS);
1969 limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS);
1970
1971 pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
1972 limits->max_perf_pct, limits->min_perf_pct);
1973}
1974
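
intel_pstate_update_perf_limits() above clamps the sysfs and policy percentages into a consistent range and converts them to extended fixed point, which intel_pstate_get_min_max() later scales against the turbo P-state. A standalone sketch of that arithmetic; EXT_FRAC_BITS = 14 and the sample numbers are assumptions for illustration only:

#include <stdint.h>
#include <stdio.h>

#define EXT_FRAC_BITS 14					/* assumed fraction width */
#define div_ext_fp(x, y)  (((uint64_t)(x) << EXT_FRAC_BITS) / (y))
#define fp_ext_toint(x)   ((x) >> EXT_FRAC_BITS)

static int clamp_pct(int v) { return v < 0 ? 0 : v > 100 ? 100 : v; }

int main(void)
{
	int min_policy_pct = 20, max_policy_pct = 100;
	int min_sysfs_pct = 40, max_sysfs_pct = 80;
	int turbo_pstate = 35;					/* hypothetical ceiling */

	/* Normalize user input to [min_policy_pct, max_policy_pct]. */
	int min_perf_pct = clamp_pct(min_sysfs_pct > min_policy_pct ?
				     min_sysfs_pct : min_policy_pct);
	int max_perf_pct = clamp_pct(max_sysfs_pct < max_policy_pct ?
				     max_sysfs_pct : max_policy_pct);
	if (min_perf_pct > max_perf_pct)
		min_perf_pct = max_perf_pct;

	/* Convert the percentage to fixed point, then scale the turbo pstate. */
	uint64_t max_perf = div_ext_fp(max_perf_pct, 100);
	int max_perf_adj = fp_ext_toint(turbo_pstate * max_perf);

	printf("min %d%% max %d%% -> max pstate %d\n",
	       min_perf_pct, max_perf_pct, max_perf_adj);
	return 0;
}
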
1553static int intel_pstate_set_policy(struct cpufreq_policy *policy) 1975static int intel_pstate_set_policy(struct cpufreq_policy *policy)
1554{ 1976{
1555 struct cpudata *cpu; 1977 struct cpudata *cpu;
1978 struct perf_limits *perf_limits = NULL;
1556 1979
1557 if (!policy->cpuinfo.max_freq) 1980 if (!policy->cpuinfo.max_freq)
1558 return -ENODEV; 1981 return -ENODEV;
@@ -1570,41 +1993,31 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
1570 policy->max = policy->cpuinfo.max_freq; 1993 policy->max = policy->cpuinfo.max_freq;
1571 } 1994 }
1572 1995
1573 if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { 1996 if (per_cpu_limits)
1574 limits = &performance_limits; 1997 perf_limits = cpu->perf_limits;
1998
1999 mutex_lock(&intel_pstate_limits_lock);
2000
2001 if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
2002 if (!perf_limits) {
2003 limits = &performance_limits;
2004 perf_limits = limits;
2005 }
1575 if (policy->max >= policy->cpuinfo.max_freq) { 2006 if (policy->max >= policy->cpuinfo.max_freq) {
1576 pr_debug("set performance\n"); 2007 pr_debug("set performance\n");
1577 intel_pstate_set_performance_limits(limits); 2008 intel_pstate_set_performance_limits(perf_limits);
1578 goto out; 2009 goto out;
1579 } 2010 }
1580 } else { 2011 } else {
1581 pr_debug("set powersave\n"); 2012 pr_debug("set powersave\n");
1582 limits = &powersave_limits; 2013 if (!perf_limits) {
1583 } 2014 limits = &powersave_limits;
1584 2015 perf_limits = limits;
1585 limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; 2016 }
1586 limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100);
1587 limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
1588 policy->cpuinfo.max_freq);
1589 limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100);
1590
1591 /* Normalize user input to [min_policy_pct, max_policy_pct] */
1592 limits->min_perf_pct = max(limits->min_policy_pct,
1593 limits->min_sysfs_pct);
1594 limits->min_perf_pct = min(limits->max_policy_pct,
1595 limits->min_perf_pct);
1596 limits->max_perf_pct = min(limits->max_policy_pct,
1597 limits->max_sysfs_pct);
1598 limits->max_perf_pct = max(limits->min_policy_pct,
1599 limits->max_perf_pct);
1600
1601 /* Make sure min_perf_pct <= max_perf_pct */
1602 limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
1603 2017
1604 limits->min_perf = div_fp(limits->min_perf_pct, 100); 2018 }
1605 limits->max_perf = div_fp(limits->max_perf_pct, 100);
1606 limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
1607 2019
2020 intel_pstate_update_perf_limits(policy, perf_limits);
1608 out: 2021 out:
1609 if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { 2022 if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
1610 /* 2023 /*
@@ -1619,6 +2032,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
1619 2032
1620 intel_pstate_hwp_set_policy(policy); 2033 intel_pstate_hwp_set_policy(policy);
1621 2034
2035 mutex_unlock(&intel_pstate_limits_lock);
2036
1622 return 0; 2037 return 0;
1623} 2038}
1624 2039
@@ -1633,22 +2048,32 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
1633 return 0; 2048 return 0;
1634} 2049}
1635 2050
2051static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
2052{
2053 intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
2054}
2055
1636static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) 2056static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
1637{ 2057{
1638 int cpu_num = policy->cpu; 2058 pr_debug("CPU %d exiting\n", policy->cpu);
1639 struct cpudata *cpu = all_cpu_data[cpu_num];
1640 2059
1641 pr_debug("CPU %d exiting\n", cpu_num); 2060 intel_pstate_clear_update_util_hook(policy->cpu);
2061 if (hwp_active)
2062 intel_pstate_hwp_save_state(policy);
2063 else
2064 intel_cpufreq_stop_cpu(policy);
2065}
1642 2066
1643 intel_pstate_clear_update_util_hook(cpu_num); 2067static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
2068{
2069 intel_pstate_exit_perf_limits(policy);
1644 2070
1645 if (hwp_active) 2071 policy->fast_switch_possible = false;
1646 return;
1647 2072
1648 intel_pstate_set_min_pstate(cpu); 2073 return 0;
1649} 2074}
1650 2075
1651static int intel_pstate_cpu_init(struct cpufreq_policy *policy) 2076static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
1652{ 2077{
1653 struct cpudata *cpu; 2078 struct cpudata *cpu;
1654 int rc; 2079 int rc;
@@ -1659,10 +2084,13 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
1659 2084
1660 cpu = all_cpu_data[policy->cpu]; 2085 cpu = all_cpu_data[policy->cpu];
1661 2086
1662 if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100) 2087 /*
1663 	policy->policy = CPUFREQ_POLICY_PERFORMANCE; 2088	 * We need a sane value in cpu->perf_limits, so inherit from the
1664 	else 2089	 * global limits, which are seeded with values based on the
1665 	policy->policy = CPUFREQ_POLICY_POWERSAVE; 2090	 * CONFIG_CPU_FREQ_DEFAULT_GOV_* selection during boot.
2091 */
2092 if (per_cpu_limits)
2093 memcpy(cpu->perf_limits, limits, sizeof(struct perf_limits));
1666 2094
1667 policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; 2095 policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
1668 policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; 2096 policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
@@ -1675,24 +2103,35 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
1675 policy->cpuinfo.max_freq *= cpu->pstate.scaling; 2103 policy->cpuinfo.max_freq *= cpu->pstate.scaling;
1676 2104
1677 intel_pstate_init_acpi_perf_limits(policy); 2105 intel_pstate_init_acpi_perf_limits(policy);
1678 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
1679 cpumask_set_cpu(policy->cpu, policy->cpus); 2106 cpumask_set_cpu(policy->cpu, policy->cpus);
1680 2107
2108 policy->fast_switch_possible = true;
2109
1681 return 0; 2110 return 0;
1682} 2111}
1683 2112
1684static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) 2113static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
1685{ 2114{
1686 intel_pstate_exit_perf_limits(policy); 2115 int ret = __intel_pstate_cpu_init(policy);
2116
2117 if (ret)
2118 return ret;
2119
2120 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
2121 if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
2122 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
2123 else
2124 policy->policy = CPUFREQ_POLICY_POWERSAVE;
1687 2125
1688 return 0; 2126 return 0;
1689} 2127}
1690 2128
1691static struct cpufreq_driver intel_pstate_driver = { 2129static struct cpufreq_driver intel_pstate = {
1692 .flags = CPUFREQ_CONST_LOOPS, 2130 .flags = CPUFREQ_CONST_LOOPS,
1693 .verify = intel_pstate_verify_policy, 2131 .verify = intel_pstate_verify_policy,
1694 .setpolicy = intel_pstate_set_policy, 2132 .setpolicy = intel_pstate_set_policy,
1695 .resume = intel_pstate_hwp_set_policy, 2133 .suspend = intel_pstate_hwp_save_state,
2134 .resume = intel_pstate_resume,
1696 .get = intel_pstate_get, 2135 .get = intel_pstate_get,
1697 .init = intel_pstate_cpu_init, 2136 .init = intel_pstate_cpu_init,
1698 .exit = intel_pstate_cpu_exit, 2137 .exit = intel_pstate_cpu_exit,
@@ -1700,6 +2139,118 @@ static struct cpufreq_driver intel_pstate_driver = {
1700 .name = "intel_pstate", 2139 .name = "intel_pstate",
1701}; 2140};
1702 2141
2142static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
2143{
2144 struct cpudata *cpu = all_cpu_data[policy->cpu];
2145 struct perf_limits *perf_limits = limits;
2146
2147 update_turbo_state();
2148 policy->cpuinfo.max_freq = limits->turbo_disabled ?
2149 cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2150
2151 cpufreq_verify_within_cpu_limits(policy);
2152
2153 if (per_cpu_limits)
2154 perf_limits = cpu->perf_limits;
2155
2156 intel_pstate_update_perf_limits(policy, perf_limits);
2157
2158 return 0;
2159}
2160
2161static unsigned int intel_cpufreq_turbo_update(struct cpudata *cpu,
2162 struct cpufreq_policy *policy,
2163 unsigned int target_freq)
2164{
2165 unsigned int max_freq;
2166
2167 update_turbo_state();
2168
2169 max_freq = limits->no_turbo || limits->turbo_disabled ?
2170 cpu->pstate.max_freq : cpu->pstate.turbo_freq;
2171 policy->cpuinfo.max_freq = max_freq;
2172 if (policy->max > max_freq)
2173 policy->max = max_freq;
2174
2175 if (target_freq > max_freq)
2176 target_freq = max_freq;
2177
2178 return target_freq;
2179}
2180
2181static int intel_cpufreq_target(struct cpufreq_policy *policy,
2182 unsigned int target_freq,
2183 unsigned int relation)
2184{
2185 struct cpudata *cpu = all_cpu_data[policy->cpu];
2186 struct cpufreq_freqs freqs;
2187 int target_pstate;
2188
2189 freqs.old = policy->cur;
2190 freqs.new = intel_cpufreq_turbo_update(cpu, policy, target_freq);
2191
2192 cpufreq_freq_transition_begin(policy, &freqs);
2193 switch (relation) {
2194 case CPUFREQ_RELATION_L:
2195 target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
2196 break;
2197 case CPUFREQ_RELATION_H:
2198 target_pstate = freqs.new / cpu->pstate.scaling;
2199 break;
2200 default:
2201 target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
2202 break;
2203 }
2204 target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
2205 if (target_pstate != cpu->pstate.current_pstate) {
2206 cpu->pstate.current_pstate = target_pstate;
2207 wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
2208 pstate_funcs.get_val(cpu, target_pstate));
2209 }
2210 cpufreq_freq_transition_end(policy, &freqs, false);
2211
2212 return 0;
2213}
2214
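
intel_cpufreq_target() above rounds the requested frequency to a P-state differently per cpufreq relation: up for RELATION_L, down for RELATION_H, and to the nearest step otherwise. A standalone sketch of that mapping, with a hypothetical 100 MHz scaling factor and simplified relation names:

#include <stdio.h>

enum relation { RELATION_L, RELATION_H, RELATION_C };

static int freq_to_pstate(unsigned int freq_khz, unsigned int scaling,
			  enum relation rel)
{
	switch (rel) {
	case RELATION_L:	/* lowest pstate at or above the target (DIV_ROUND_UP) */
		return (freq_khz + scaling - 1) / scaling;
	case RELATION_H:	/* highest pstate at or below the target */
		return freq_khz / scaling;
	default:		/* closest pstate (DIV_ROUND_CLOSEST) */
		return (freq_khz + scaling / 2) / scaling;
	}
}

int main(void)
{
	unsigned int scaling = 100000;	/* 100 MHz per pstate step, for illustration */

	printf("L:%d H:%d C:%d\n",
	       freq_to_pstate(2550000, scaling, RELATION_L),
	       freq_to_pstate(2550000, scaling, RELATION_H),
	       freq_to_pstate(2550000, scaling, RELATION_C));
	return 0;
}
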
2215static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
2216 unsigned int target_freq)
2217{
2218 struct cpudata *cpu = all_cpu_data[policy->cpu];
2219 int target_pstate;
2220
2221 target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq);
2222 target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
2223 intel_pstate_update_pstate(cpu, target_pstate);
2224 return target_freq;
2225}
2226
2227static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
2228{
2229 int ret = __intel_pstate_cpu_init(policy);
2230
2231 if (ret)
2232 return ret;
2233
2234 policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
2235 /* This reflects the intel_pstate_get_cpu_pstates() setting. */
2236 policy->cur = policy->cpuinfo.min_freq;
2237
2238 return 0;
2239}
2240
2241static struct cpufreq_driver intel_cpufreq = {
2242 .flags = CPUFREQ_CONST_LOOPS,
2243 .verify = intel_cpufreq_verify_policy,
2244 .target = intel_cpufreq_target,
2245 .fast_switch = intel_cpufreq_fast_switch,
2246 .init = intel_cpufreq_cpu_init,
2247 .exit = intel_pstate_cpu_exit,
2248 .stop_cpu = intel_cpufreq_stop_cpu,
2249 .name = "intel_cpufreq",
2250};
2251
2252static struct cpufreq_driver *intel_pstate_driver = &intel_pstate;
2253
1703static int no_load __initdata; 2254static int no_load __initdata;
1704static int no_hwp __initdata; 2255static int no_hwp __initdata;
1705static int hwp_only __initdata; 2256static int hwp_only __initdata;
@@ -1726,6 +2277,19 @@ static void __init copy_pid_params(struct pstate_adjust_policy *policy)
1726 pid_params.setpoint = policy->setpoint; 2277 pid_params.setpoint = policy->setpoint;
1727} 2278}
1728 2279
2280#ifdef CONFIG_ACPI
2281static void intel_pstate_use_acpi_profile(void)
2282{
2283 if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
2284 pstate_funcs.get_target_pstate =
2285 get_target_pstate_use_cpu_load;
2286}
2287#else
2288static void intel_pstate_use_acpi_profile(void)
2289{
2290}
2291#endif
2292
1729static void __init copy_cpu_funcs(struct pstate_funcs *funcs) 2293static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
1730{ 2294{
1731 pstate_funcs.get_max = funcs->get_max; 2295 pstate_funcs.get_max = funcs->get_max;
@@ -1737,6 +2301,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
1737 pstate_funcs.get_vid = funcs->get_vid; 2301 pstate_funcs.get_vid = funcs->get_vid;
1738 pstate_funcs.get_target_pstate = funcs->get_target_pstate; 2302 pstate_funcs.get_target_pstate = funcs->get_target_pstate;
1739 2303
2304 intel_pstate_use_acpi_profile();
1740} 2305}
1741 2306
1742#ifdef CONFIG_ACPI 2307#ifdef CONFIG_ACPI
@@ -1850,9 +2415,20 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
1850 2415
1851 return false; 2416 return false;
1852} 2417}
2418
2419static void intel_pstate_request_control_from_smm(void)
2420{
2421 /*
2422 * It may be unsafe to request P-states control from SMM if _PPC support
2423 * has not been enabled.
2424 */
2425 if (acpi_ppc)
2426 acpi_processor_pstate_control();
2427}
1853#else /* CONFIG_ACPI not enabled */ 2428#else /* CONFIG_ACPI not enabled */
1854static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } 2429static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
1855static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 2430static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
2431static inline void intel_pstate_request_control_from_smm(void) {}
1856#endif /* CONFIG_ACPI */ 2432#endif /* CONFIG_ACPI */
1857 2433
1858static const struct x86_cpu_id hwp_support_ids[] __initconst = { 2434static const struct x86_cpu_id hwp_support_ids[] __initconst = {
@@ -1872,6 +2448,7 @@ static int __init intel_pstate_init(void)
1872 if (x86_match_cpu(hwp_support_ids) && !no_hwp) { 2448 if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
1873 copy_cpu_funcs(&core_params.funcs); 2449 copy_cpu_funcs(&core_params.funcs);
1874 hwp_active++; 2450 hwp_active++;
2451 intel_pstate.attr = hwp_cpufreq_attrs;
1875 goto hwp_cpu_matched; 2452 goto hwp_cpu_matched;
1876 } 2453 }
1877 2454
@@ -1904,7 +2481,9 @@ hwp_cpu_matched:
1904 if (!hwp_active && hwp_only) 2481 if (!hwp_active && hwp_only)
1905 goto out; 2482 goto out;
1906 2483
1907 rc = cpufreq_register_driver(&intel_pstate_driver); 2484 intel_pstate_request_control_from_smm();
2485
2486 rc = cpufreq_register_driver(intel_pstate_driver);
1908 if (rc) 2487 if (rc)
1909 goto out; 2488 goto out;
1910 2489
@@ -1919,7 +2498,9 @@ out:
1919 get_online_cpus(); 2498 get_online_cpus();
1920 for_each_online_cpu(cpu) { 2499 for_each_online_cpu(cpu) {
1921 if (all_cpu_data[cpu]) { 2500 if (all_cpu_data[cpu]) {
1922 intel_pstate_clear_update_util_hook(cpu); 2501 if (intel_pstate_driver == &intel_pstate)
2502 intel_pstate_clear_update_util_hook(cpu);
2503
1923 kfree(all_cpu_data[cpu]); 2504 kfree(all_cpu_data[cpu]);
1924 } 2505 }
1925 } 2506 }
@@ -1935,8 +2516,13 @@ static int __init intel_pstate_setup(char *str)
1935 if (!str) 2516 if (!str)
1936 return -EINVAL; 2517 return -EINVAL;
1937 2518
1938 if (!strcmp(str, "disable")) 2519 if (!strcmp(str, "disable")) {
1939 no_load = 1; 2520 no_load = 1;
2521 } else if (!strcmp(str, "passive")) {
2522 pr_info("Passive mode enabled\n");
2523 intel_pstate_driver = &intel_cpufreq;
2524 no_hwp = 1;
2525 }
1940 if (!strcmp(str, "no_hwp")) { 2526 if (!strcmp(str, "no_hwp")) {
1941 pr_info("HWP disabled\n"); 2527 pr_info("HWP disabled\n");
1942 no_hwp = 1; 2528 no_hwp = 1;
@@ -1945,6 +2531,8 @@ static int __init intel_pstate_setup(char *str)
1945 force_load = 1; 2531 force_load = 1;
1946 if (!strcmp(str, "hwp_only")) 2532 if (!strcmp(str, "hwp_only"))
1947 hwp_only = 1; 2533 hwp_only = 1;
2534 if (!strcmp(str, "per_cpu_perf_limits"))
2535 per_cpu_limits = true;
1948 2536
1949#ifdef CONFIG_ACPI 2537#ifdef CONFIG_ACPI
1950 if (!strcmp(str, "support_acpi_ppc")) 2538 if (!strcmp(str, "support_acpi_ppc"))
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde806629..37671b545880 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -42,6 +42,10 @@
42#define PMSR_PSAFE_ENABLE (1UL << 30) 42#define PMSR_PSAFE_ENABLE (1UL << 30)
43#define PMSR_SPR_EM_DISABLE (1UL << 31) 43#define PMSR_SPR_EM_DISABLE (1UL << 31)
44#define PMSR_MAX(x) ((x >> 32) & 0xFF) 44#define PMSR_MAX(x) ((x >> 32) & 0xFF)
45#define LPSTATE_SHIFT 48
46#define GPSTATE_SHIFT 56
47#define GET_LPSTATE(x) (((x) >> LPSTATE_SHIFT) & 0xFF)
48#define GET_GPSTATE(x) (((x) >> GPSTATE_SHIFT) & 0xFF)
45 49
46#define MAX_RAMP_DOWN_TIME 5120 50#define MAX_RAMP_DOWN_TIME 5120
47/* 51/*
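
The new macros above pull the local and global P-states out of bits 48 and 56 of the PMCR value read in the timer handler. A tiny standalone sketch using a made-up register value; the (int8_t) casts mirror the (s8) casts applied in gpstate_timer_handler():

#include <stdint.h>
#include <stdio.h>

#define LPSTATE_SHIFT	48
#define GPSTATE_SHIFT	56
#define GET_LPSTATE(x)	(((x) >> LPSTATE_SHIFT) & 0xFF)
#define GET_GPSTATE(x)	(((x) >> GPSTATE_SHIFT) & 0xFF)

int main(void)
{
	/* Hypothetical PMCR contents: global pstate 0xF6, local pstate 0xFA. */
	uint64_t pmcr = ((uint64_t)0xF6 << GPSTATE_SHIFT) |
			((uint64_t)0xFA << LPSTATE_SHIFT);

	/* P-state ids are treated as signed 8-bit values, hence the casts. */
	printf("gpstate %d lpstate %d\n",
	       (int8_t)GET_GPSTATE(pmcr), (int8_t)GET_LPSTATE(pmcr));
	return 0;
}
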
@@ -592,7 +596,8 @@ void gpstate_timer_handler(unsigned long data)
592{ 596{
593 struct cpufreq_policy *policy = (struct cpufreq_policy *)data; 597 struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
594 struct global_pstate_info *gpstates = policy->driver_data; 598 struct global_pstate_info *gpstates = policy->driver_data;
595 int gpstate_idx; 599 int gpstate_idx, lpstate_idx;
600 unsigned long val;
596 unsigned int time_diff = jiffies_to_msecs(jiffies) 601 unsigned int time_diff = jiffies_to_msecs(jiffies)
597 - gpstates->last_sampled_time; 602 - gpstates->last_sampled_time;
598 struct powernv_smp_call_data freq_data; 603 struct powernv_smp_call_data freq_data;
@@ -600,21 +605,37 @@ void gpstate_timer_handler(unsigned long data)
600 if (!spin_trylock(&gpstates->gpstate_lock)) 605 if (!spin_trylock(&gpstates->gpstate_lock))
601 return; 606 return;
602 607
608 /*
 609	 * If PMCR was last updated using fast_switch, then
 610	 * gpstates->last_lpstate_idx may hold a stale value.
 611	 * Hence, read PMCR to get the correct data.
612 */
613 val = get_pmspr(SPRN_PMCR);
614 freq_data.gpstate_id = (s8)GET_GPSTATE(val);
615 freq_data.pstate_id = (s8)GET_LPSTATE(val);
616 if (freq_data.gpstate_id == freq_data.pstate_id) {
617 reset_gpstates(policy);
618 spin_unlock(&gpstates->gpstate_lock);
619 return;
620 }
621
603 gpstates->last_sampled_time += time_diff; 622 gpstates->last_sampled_time += time_diff;
604 gpstates->elapsed_time += time_diff; 623 gpstates->elapsed_time += time_diff;
605 freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);
606 624
607 if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) || 625 if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
608 (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
609 gpstate_idx = pstate_to_idx(freq_data.pstate_id); 626 gpstate_idx = pstate_to_idx(freq_data.pstate_id);
627 lpstate_idx = gpstate_idx;
610 reset_gpstates(policy); 628 reset_gpstates(policy);
611 gpstates->highest_lpstate_idx = gpstate_idx; 629 gpstates->highest_lpstate_idx = gpstate_idx;
612 } else { 630 } else {
631 lpstate_idx = pstate_to_idx(freq_data.pstate_id);
613 gpstate_idx = calc_global_pstate(gpstates->elapsed_time, 632 gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
614 gpstates->highest_lpstate_idx, 633 gpstates->highest_lpstate_idx,
615 gpstates->last_lpstate_idx); 634 lpstate_idx);
616 } 635 }
617 636 freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
637 gpstates->last_gpstate_idx = gpstate_idx;
638 gpstates->last_lpstate_idx = lpstate_idx;
618 /* 639 /*
619 * If local pstate is equal to global pstate, rampdown is over 640 * If local pstate is equal to global pstate, rampdown is over
620 * So timer is not required to be queued. 641 * So timer is not required to be queued.
@@ -622,10 +643,6 @@ void gpstate_timer_handler(unsigned long data)
622 if (gpstate_idx != gpstates->last_lpstate_idx) 643 if (gpstate_idx != gpstates->last_lpstate_idx)
623 queue_gpstate_timer(gpstates); 644 queue_gpstate_timer(gpstates);
624 645
625 freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
626 gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id);
627 gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id);
628
629 spin_unlock(&gpstates->gpstate_lock); 646 spin_unlock(&gpstates->gpstate_lock);
630 647
631 /* Timer may get migrated to a different cpu on cpu hot unplug */ 648 /* Timer may get migrated to a different cpu on cpu hot unplug */
@@ -647,8 +664,14 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
647 if (unlikely(rebooting) && new_index != get_nominal_index()) 664 if (unlikely(rebooting) && new_index != get_nominal_index())
648 return 0; 665 return 0;
649 666
650 if (!throttled) 667 if (!throttled) {
668 /* we don't want to be preempted while
669 * checking if the CPU frequency has been throttled
670 */
671 preempt_disable();
651 powernv_cpufreq_throttle_check(NULL); 672 powernv_cpufreq_throttle_check(NULL);
673 preempt_enable();
674 }
652 675
653 cur_msec = jiffies_to_msecs(get_jiffies_64()); 676 cur_msec = jiffies_to_msecs(get_jiffies_64());
654 677
@@ -752,9 +775,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
752 spin_lock_init(&gpstates->gpstate_lock); 775 spin_lock_init(&gpstates->gpstate_lock);
753 ret = cpufreq_table_validate_and_show(policy, powernv_freqs); 776 ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
754 777
755 if (ret < 0) 778 if (ret < 0) {
756 kfree(policy->driver_data); 779 kfree(policy->driver_data);
780 return ret;
781 }
757 782
783 policy->fast_switch_possible = true;
758 return ret; 784 return ret;
759} 785}
760 786
@@ -897,6 +923,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
897 del_timer_sync(&gpstates->timer); 923 del_timer_sync(&gpstates->timer);
898} 924}
899 925
926static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
927 unsigned int target_freq)
928{
929 int index;
930 struct powernv_smp_call_data freq_data;
931
932 index = cpufreq_table_find_index_dl(policy, target_freq);
933 freq_data.pstate_id = powernv_freqs[index].driver_data;
934 freq_data.gpstate_id = powernv_freqs[index].driver_data;
935 set_pstate(&freq_data);
936
937 return powernv_freqs[index].frequency;
938}
939
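
powernv_fast_switch() above resolves the target frequency through cpufreq_table_find_index_dl() before writing the P-state. A userspace sketch of what such a lookup on a descending table plausibly does — pick the lowest frequency at or above the target, falling back to the highest entry — under the assumption that this matches the helper's RELATION_L semantics:

#include <stdio.h>

static int find_index_dl(const unsigned int *freqs, int n, unsigned int target)
{
	int best = -1;

	for (int i = 0; i < n; i++) {
		if (freqs[i] >= target)
			best = i;	/* keep walking down for a tighter fit */
		else
			return best >= 0 ? best : i;
	}
	return best;			/* target below the table: lowest entry */
}

int main(void)
{
	/* Hypothetical frequency table in kHz, highest first. */
	const unsigned int freqs[] = { 3500000, 3200000, 2800000, 2400000, 2061000 };
	int idx = find_index_dl(freqs, 5, 2500000);

	printf("target 2.5 GHz -> index %d (%u kHz)\n", idx, freqs[idx]);
	return 0;
}
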
900static struct cpufreq_driver powernv_cpufreq_driver = { 940static struct cpufreq_driver powernv_cpufreq_driver = {
901 .name = "powernv-cpufreq", 941 .name = "powernv-cpufreq",
902 .flags = CPUFREQ_CONST_LOOPS, 942 .flags = CPUFREQ_CONST_LOOPS,
@@ -904,6 +944,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
904 .exit = powernv_cpufreq_cpu_exit, 944 .exit = powernv_cpufreq_cpu_exit,
905 .verify = cpufreq_generic_frequency_table_verify, 945 .verify = cpufreq_generic_frequency_table_verify,
906 .target_index = powernv_cpufreq_target_index, 946 .target_index = powernv_cpufreq_target_index,
947 .fast_switch = powernv_fast_switch,
907 .get = powernv_cpufreq_get, 948 .get = powernv_cpufreq_get,
908 .stop_cpu = powernv_cpufreq_stop_cpu, 949 .stop_cpu = powernv_cpufreq_stop_cpu,
909 .attr = powernv_cpu_freq_attr, 950 .attr = powernv_cpu_freq_attr,
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 7fe442ca38f4..0835a37a5f3a 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -22,7 +22,7 @@
22 22
23#define POWERNV_THRESHOLD_LATENCY_NS 200000 23#define POWERNV_THRESHOLD_LATENCY_NS 200000
24 24
25struct cpuidle_driver powernv_idle_driver = { 25static struct cpuidle_driver powernv_idle_driver = {
26 .name = "powernv_idle", 26 .name = "powernv_idle",
27 .owner = THIS_MODULE, 27 .owner = THIS_MODULE,
28}; 28};
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c73207abb5a4..62810ff3b00f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -97,7 +97,23 @@ static int find_deepest_state(struct cpuidle_driver *drv,
97 return ret; 97 return ret;
98} 98}
99 99
100#ifdef CONFIG_SUSPEND 100/**
101 * cpuidle_use_deepest_state - Set/clear governor override flag.
102 * @enable: New value of the flag.
103 *
 104 * If @enable is set, the current CPU will use the deepest available idle
 105 * state, overriding the governors, until the flag is cleared.
106 */
107void cpuidle_use_deepest_state(bool enable)
108{
109 struct cpuidle_device *dev;
110
111 preempt_disable();
112 dev = cpuidle_get_device();
113 dev->use_deepest_state = enable;
114 preempt_enable();
115}
116
101/** 117/**
102 * cpuidle_find_deepest_state - Find the deepest available idle state. 118 * cpuidle_find_deepest_state - Find the deepest available idle state.
103 * @drv: cpuidle driver for the given CPU. 119 * @drv: cpuidle driver for the given CPU.
@@ -109,6 +125,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
109 return find_deepest_state(drv, dev, UINT_MAX, 0, false); 125 return find_deepest_state(drv, dev, UINT_MAX, 0, false);
110} 126}
111 127
128#ifdef CONFIG_SUSPEND
112static void enter_freeze_proper(struct cpuidle_driver *drv, 129static void enter_freeze_proper(struct cpuidle_driver *drv,
113 struct cpuidle_device *dev, int index) 130 struct cpuidle_device *dev, int index)
114{ 131{
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index a5c111b67f37..ffca4fc0061d 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state,
38 * state enter function. 38 * state enter function.
39 */ 39 */
40 idle_state->enter = match_id->data; 40 idle_state->enter = match_id->data;
41 /*
42 * Since this is not a "coupled" state, it's safe to assume interrupts
 43 * won't be enabled when it exits, allowing the tick to be frozen
 44 * safely. So enter() can also be used as the enter_freeze() callback.
45 */
46 idle_state->enter_freeze = match_id->data;
41 47
42 err = of_property_read_u32(state_node, "wakeup-latency-us", 48 err = of_property_read_u32(state_node, "wakeup-latency-us",
43 &idle_state->exit_latency); 49 &idle_state->exit_latency);
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index fb9f511cca23..4e78263e34a4 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -9,7 +9,6 @@
9 */ 9 */
10 10
11#include <linux/mutex.h> 11#include <linux/mutex.h>
12#include <linux/module.h>
13#include <linux/cpuidle.h> 12#include <linux/cpuidle.h>
14 13
15#include "cpuidle.h" 14#include "cpuidle.h"
@@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov)
53 if (cpuidle_curr_governor) { 52 if (cpuidle_curr_governor) {
54 list_for_each_entry(dev, &cpuidle_detected_devices, device_list) 53 list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
55 cpuidle_disable_device(dev); 54 cpuidle_disable_device(dev);
56 module_put(cpuidle_curr_governor->owner);
57 } 55 }
58 56
59 cpuidle_curr_governor = gov; 57 cpuidle_curr_governor = gov;
60 58
61 if (gov) { 59 if (gov) {
62 if (!try_module_get(cpuidle_curr_governor->owner))
63 return -EINVAL;
64 list_for_each_entry(dev, &cpuidle_detected_devices, device_list) 60 list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
65 cpuidle_enable_device(dev); 61 cpuidle_enable_device(dev);
66 cpuidle_install_idle_handler(); 62 cpuidle_install_idle_handler();
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 63bd5a403e22..fe8f08948fcb 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -15,7 +15,6 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/cpuidle.h> 16#include <linux/cpuidle.h>
17#include <linux/pm_qos.h> 17#include <linux/pm_qos.h>
18#include <linux/module.h>
19#include <linux/jiffies.h> 18#include <linux/jiffies.h>
20#include <linux/tick.h> 19#include <linux/tick.h>
21 20
@@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = {
177 .enable = ladder_enable_device, 176 .enable = ladder_enable_device,
178 .select = ladder_select_state, 177 .select = ladder_select_state,
179 .reflect = ladder_reflect, 178 .reflect = ladder_reflect,
180 .owner = THIS_MODULE,
181}; 179};
182 180
183/** 181/**
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 03d38c291de6..d9b5b9398a0f 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,6 @@
19#include <linux/tick.h> 19#include <linux/tick.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h> 21#include <linux/math64.h>
22#include <linux/module.h>
23 22
24/* 23/*
25 * Please note when changing the tuning values: 24 * Please note when changing the tuning values:
@@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = {
484 .enable = menu_enable_device, 483 .enable = menu_enable_device,
485 .select = menu_select, 484 .select = menu_select,
486 .reflect = menu_reflect, 485 .reflect = menu_reflect,
487 .owner = THIS_MODULE,
488}; 486};
489 487
490/** 488/**
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 832a2c3f01ff..c5adc8c9ac43 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device)
403 /* state statistics */ 403 /* state statistics */
404 for (i = 0; i < drv->state_count; i++) { 404 for (i = 0; i < drv->state_count; i++) {
405 kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); 405 kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
406 if (!kobj) 406 if (!kobj) {
407 ret = -ENOMEM;
407 goto error_state; 408 goto error_state;
409 }
408 kobj->state = &drv->states[i]; 410 kobj->state = &drv->states[i];
409 kobj->state_usage = &device->states_usage[i]; 411 kobj->state_usage = &device->states_usage[i];
410 init_completion(&kobj->kobj_unregister); 412 init_completion(&kobj->kobj_unregister);
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index bf3ea7603a58..a324801d6a66 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -850,7 +850,7 @@ err_out:
850EXPORT_SYMBOL(devfreq_add_governor); 850EXPORT_SYMBOL(devfreq_add_governor);
851 851
852/** 852/**
853 * devfreq_remove_device() - Remove devfreq feature from a device. 853 * devfreq_remove_governor() - Remove the given devfreq governor.
854 * @governor: the devfreq governor to be removed 854 * @governor: the devfreq governor to be removed
855 */ 855 */
856int devfreq_remove_governor(struct devfreq_governor *governor) 856int devfreq_remove_governor(struct devfreq_governor *governor)
diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c
index 49e712aca0c1..5c3e7b11e8a6 100644
--- a/drivers/devfreq/event/exynos-nocp.c
+++ b/drivers/devfreq/event/exynos-nocp.c
@@ -190,6 +190,7 @@ static const struct of_device_id exynos_nocp_id_match[] = {
190 { .compatible = "samsung,exynos5420-nocp", }, 190 { .compatible = "samsung,exynos5420-nocp", },
191 { /* sentinel */ }, 191 { /* sentinel */ },
192}; 192};
193MODULE_DEVICE_TABLE(of, exynos_nocp_id_match);
193 194
194static struct regmap_config exynos_nocp_regmap_config = { 195static struct regmap_config exynos_nocp_regmap_config = {
195 .reg_bits = 32, 196 .reg_bits = 32,
diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index f55cf0eb2a66..107eb91a9415 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -15,7 +15,6 @@
15#include <linux/io.h> 15#include <linux/io.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/mutex.h>
19#include <linux/of_address.h> 18#include <linux/of_address.h>
20#include <linux/platform_device.h> 19#include <linux/platform_device.h>
21#include <linux/suspend.h> 20#include <linux/suspend.h>
@@ -34,7 +33,6 @@ struct exynos_ppmu {
34 unsigned int num_events; 33 unsigned int num_events;
35 34
36 struct device *dev; 35 struct device *dev;
37 struct mutex lock;
38 36
39 struct exynos_ppmu_data ppmu; 37 struct exynos_ppmu_data ppmu;
40}; 38};
@@ -90,8 +88,6 @@ struct __exynos_ppmu_events {
90 PPMU_EVENT(d1-cpu), 88 PPMU_EVENT(d1-cpu),
91 PPMU_EVENT(d1-general), 89 PPMU_EVENT(d1-general),
92 PPMU_EVENT(d1-rt), 90 PPMU_EVENT(d1-rt),
93
94 { /* sentinel */ },
95}; 91};
96 92
97static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev) 93static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev)
@@ -351,6 +347,7 @@ static const struct of_device_id exynos_ppmu_id_match[] = {
351 }, 347 },
352 { /* sentinel */ }, 348 { /* sentinel */ },
353}; 349};
350MODULE_DEVICE_TABLE(of, exynos_ppmu_id_match);
354 351
355static struct devfreq_event_ops *exynos_bus_get_ops(struct device_node *np) 352static struct devfreq_event_ops *exynos_bus_get_ops(struct device_node *np)
356{ 353{
@@ -463,7 +460,6 @@ static int exynos_ppmu_probe(struct platform_device *pdev)
463 if (!info) 460 if (!info)
464 return -ENOMEM; 461 return -ENOMEM;
465 462
466 mutex_init(&info->lock);
467 info->dev = &pdev->dev; 463 info->dev = &pdev->dev;
468 464
469 /* Parse dt data to get resource */ 465 /* Parse dt data to get resource */
diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c
index 43fcc5a7f515..22b113363ffc 100644
--- a/drivers/devfreq/event/rockchip-dfi.c
+++ b/drivers/devfreq/event/rockchip-dfi.c
@@ -188,6 +188,7 @@ static const struct of_device_id rockchip_dfi_id_match[] = {
188 { .compatible = "rockchip,rk3399-dfi" }, 188 { .compatible = "rockchip,rk3399-dfi" },
189 { }, 189 { },
190}; 190};
191MODULE_DEVICE_TABLE(of, rockchip_dfi_id_match);
191 192
192static int rockchip_dfi_probe(struct platform_device *pdev) 193static int rockchip_dfi_probe(struct platform_device *pdev)
193{ 194{
diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index 29866f7e6d7e..a8ed7792ece2 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -35,7 +35,7 @@ struct exynos_bus {
35 unsigned int edev_count; 35 unsigned int edev_count;
36 struct mutex lock; 36 struct mutex lock;
37 37
38 struct dev_pm_opp *curr_opp; 38 unsigned long curr_freq;
39 39
40 struct regulator *regulator; 40 struct regulator *regulator;
41 struct clk *clk; 41 struct clk *clk;
@@ -99,7 +99,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
99{ 99{
100 struct exynos_bus *bus = dev_get_drvdata(dev); 100 struct exynos_bus *bus = dev_get_drvdata(dev);
101 struct dev_pm_opp *new_opp; 101 struct dev_pm_opp *new_opp;
102 unsigned long old_freq, new_freq, old_volt, new_volt, tol; 102 unsigned long old_freq, new_freq, new_volt, tol;
103 int ret = 0; 103 int ret = 0;
104 104
105 /* Get new opp-bus instance according to new bus clock */ 105 /* Get new opp-bus instance according to new bus clock */
@@ -113,8 +113,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
113 113
114 new_freq = dev_pm_opp_get_freq(new_opp); 114 new_freq = dev_pm_opp_get_freq(new_opp);
115 new_volt = dev_pm_opp_get_voltage(new_opp); 115 new_volt = dev_pm_opp_get_voltage(new_opp);
116 old_freq = dev_pm_opp_get_freq(bus->curr_opp); 116 old_freq = bus->curr_freq;
117 old_volt = dev_pm_opp_get_voltage(bus->curr_opp);
118 rcu_read_unlock(); 117 rcu_read_unlock();
119 118
120 if (old_freq == new_freq) 119 if (old_freq == new_freq)
@@ -146,7 +145,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
146 goto out; 145 goto out;
147 } 146 }
148 } 147 }
149 bus->curr_opp = new_opp; 148 bus->curr_freq = new_freq;
150 149
151 dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n", 150 dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n",
152 old_freq/1000, new_freq/1000); 151 old_freq/1000, new_freq/1000);
@@ -163,9 +162,7 @@ static int exynos_bus_get_dev_status(struct device *dev,
163 struct devfreq_event_data edata; 162 struct devfreq_event_data edata;
164 int ret; 163 int ret;
165 164
166 rcu_read_lock(); 165 stat->current_frequency = bus->curr_freq;
167 stat->current_frequency = dev_pm_opp_get_freq(bus->curr_opp);
168 rcu_read_unlock();
169 166
170 ret = exynos_bus_get_event(bus, &edata); 167 ret = exynos_bus_get_event(bus, &edata);
171 if (ret < 0) { 168 if (ret < 0) {
@@ -226,7 +223,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq,
226 } 223 }
227 224
228 new_freq = dev_pm_opp_get_freq(new_opp); 225 new_freq = dev_pm_opp_get_freq(new_opp);
229 old_freq = dev_pm_opp_get_freq(bus->curr_opp); 226 old_freq = bus->curr_freq;
230 rcu_read_unlock(); 227 rcu_read_unlock();
231 228
232 if (old_freq == new_freq) 229 if (old_freq == new_freq)
@@ -242,7 +239,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq,
242 } 239 }
243 240
244 *freq = new_freq; 241 *freq = new_freq;
245 bus->curr_opp = new_opp; 242 bus->curr_freq = new_freq;
246 243
247 dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n", 244 dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n",
248 old_freq/1000, new_freq/1000); 245 old_freq/1000, new_freq/1000);
@@ -335,6 +332,7 @@ static int exynos_bus_parse_of(struct device_node *np,
335 struct exynos_bus *bus) 332 struct exynos_bus *bus)
336{ 333{
337 struct device *dev = bus->dev; 334 struct device *dev = bus->dev;
335 struct dev_pm_opp *opp;
338 unsigned long rate; 336 unsigned long rate;
339 int ret; 337 int ret;
340 338
@@ -352,22 +350,23 @@ static int exynos_bus_parse_of(struct device_node *np,
352 } 350 }
353 351
354 /* Get the freq and voltage from OPP table to scale the bus freq */ 352 /* Get the freq and voltage from OPP table to scale the bus freq */
355 rcu_read_lock();
356 ret = dev_pm_opp_of_add_table(dev); 353 ret = dev_pm_opp_of_add_table(dev);
357 if (ret < 0) { 354 if (ret < 0) {
358 dev_err(dev, "failed to get OPP table\n"); 355 dev_err(dev, "failed to get OPP table\n");
359 rcu_read_unlock();
360 goto err_clk; 356 goto err_clk;
361 } 357 }
362 358
363 rate = clk_get_rate(bus->clk); 359 rate = clk_get_rate(bus->clk);
364 bus->curr_opp = devfreq_recommended_opp(dev, &rate, 0); 360
365 if (IS_ERR(bus->curr_opp)) { 361 rcu_read_lock();
362 opp = devfreq_recommended_opp(dev, &rate, 0);
363 if (IS_ERR(opp)) {
366 dev_err(dev, "failed to find dev_pm_opp\n"); 364 dev_err(dev, "failed to find dev_pm_opp\n");
367 rcu_read_unlock(); 365 rcu_read_unlock();
368 ret = PTR_ERR(bus->curr_opp); 366 ret = PTR_ERR(opp);
369 goto err_opp; 367 goto err_opp;
370 } 368 }
369 bus->curr_freq = dev_pm_opp_get_freq(opp);
371 rcu_read_unlock(); 370 rcu_read_unlock();
372 371
373 return 0; 372 return 0;
diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index e24b73d66659..27d2f349b53c 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -80,7 +80,6 @@ struct rk3399_dmcfreq {
80 struct regulator *vdd_center; 80 struct regulator *vdd_center;
81 unsigned long rate, target_rate; 81 unsigned long rate, target_rate;
82 unsigned long volt, target_volt; 82 unsigned long volt, target_volt;
83 struct dev_pm_opp *curr_opp;
84}; 83};
85 84
86static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, 85static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
@@ -102,9 +101,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
102 target_rate = dev_pm_opp_get_freq(opp); 101 target_rate = dev_pm_opp_get_freq(opp);
103 target_volt = dev_pm_opp_get_voltage(opp); 102 target_volt = dev_pm_opp_get_voltage(opp);
104 103
105 dmcfreq->rate = dev_pm_opp_get_freq(dmcfreq->curr_opp);
106 dmcfreq->volt = dev_pm_opp_get_voltage(dmcfreq->curr_opp);
107
108 rcu_read_unlock(); 104 rcu_read_unlock();
109 105
110 if (dmcfreq->rate == target_rate) 106 if (dmcfreq->rate == target_rate)
@@ -165,7 +161,9 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
165 if (err) 161 if (err)
166 dev_err(dev, "Cannot to set vol %lu uV\n", target_volt); 162 dev_err(dev, "Cannot to set vol %lu uV\n", target_volt);
167 163
168 dmcfreq->curr_opp = opp; 164 dmcfreq->rate = target_rate;
165 dmcfreq->volt = target_volt;
166
169out: 167out:
170 mutex_unlock(&dmcfreq->lock); 168 mutex_unlock(&dmcfreq->lock);
171 return err; 169 return err;
@@ -414,7 +412,6 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
414 */ 412 */
415 if (dev_pm_opp_of_add_table(dev)) { 413 if (dev_pm_opp_of_add_table(dev)) {
416 dev_err(dev, "Invalid operating-points in device tree.\n"); 414 dev_err(dev, "Invalid operating-points in device tree.\n");
417 rcu_read_unlock();
418 return -EINVAL; 415 return -EINVAL;
419 } 416 }
420 417
@@ -431,12 +428,13 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
431 rcu_read_unlock(); 428 rcu_read_unlock();
432 return PTR_ERR(opp); 429 return PTR_ERR(opp);
433 } 430 }
431 data->rate = dev_pm_opp_get_freq(opp);
432 data->volt = dev_pm_opp_get_voltage(opp);
434 rcu_read_unlock(); 433 rcu_read_unlock();
435 data->curr_opp = opp;
436 434
437 rk3399_devfreq_dmc_profile.initial_freq = data->rate; 435 rk3399_devfreq_dmc_profile.initial_freq = data->rate;
438 436
439 data->devfreq = devfreq_add_device(dev, 437 data->devfreq = devm_devfreq_add_device(dev,
440 &rk3399_devfreq_dmc_profile, 438 &rk3399_devfreq_dmc_profile,
441 "simple_ondemand", 439 "simple_ondemand",
442 &data->ondemand_data); 440 &data->ondemand_data);
@@ -454,6 +452,7 @@ static const struct of_device_id rk3399dmc_devfreq_of_match[] = {
454 { .compatible = "rockchip,rk3399-dmc" }, 452 { .compatible = "rockchip,rk3399-dmc" },
455 { }, 453 { },
456}; 454};
455MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match);
457 456
458static struct platform_driver rk3399_dmcfreq_driver = { 457static struct platform_driver rk3399_dmcfreq_driver = {
459 .probe = rk3399_dmcfreq_probe, 458 .probe = rk3399_dmcfreq_probe,
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 4466a2f969d7..7d8ea3d5fda6 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev,
98 struct cpuidle_driver *drv, int index); 98 struct cpuidle_driver *drv, int index);
99static void intel_idle_freeze(struct cpuidle_device *dev, 99static void intel_idle_freeze(struct cpuidle_device *dev,
100 struct cpuidle_driver *drv, int index); 100 struct cpuidle_driver *drv, int index);
101static int intel_idle_cpu_init(int cpu);
102
103static struct cpuidle_state *cpuidle_state_table; 101static struct cpuidle_state *cpuidle_state_table;
104 102
105/* 103/*
@@ -724,6 +722,50 @@ static struct cpuidle_state atom_cstates[] = {
724 { 722 {
725 .enter = NULL } 723 .enter = NULL }
726}; 724};
725static struct cpuidle_state tangier_cstates[] = {
726 {
727 .name = "C1-TNG",
728 .desc = "MWAIT 0x00",
729 .flags = MWAIT2flg(0x00),
730 .exit_latency = 1,
731 .target_residency = 4,
732 .enter = &intel_idle,
733 .enter_freeze = intel_idle_freeze, },
734 {
735 .name = "C4-TNG",
736 .desc = "MWAIT 0x30",
737 .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
738 .exit_latency = 100,
739 .target_residency = 400,
740 .enter = &intel_idle,
741 .enter_freeze = intel_idle_freeze, },
742 {
743 .name = "C6-TNG",
744 .desc = "MWAIT 0x52",
745 .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
746 .exit_latency = 140,
747 .target_residency = 560,
748 .enter = &intel_idle,
749 .enter_freeze = intel_idle_freeze, },
750 {
751 .name = "C7-TNG",
752 .desc = "MWAIT 0x60",
753 .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
754 .exit_latency = 1200,
755 .target_residency = 4000,
756 .enter = &intel_idle,
757 .enter_freeze = intel_idle_freeze, },
758 {
759 .name = "C9-TNG",
760 .desc = "MWAIT 0x64",
761 .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
762 .exit_latency = 10000,
763 .target_residency = 20000,
764 .enter = &intel_idle,
765 .enter_freeze = intel_idle_freeze, },
766 {
767 .enter = NULL }
768};
727static struct cpuidle_state avn_cstates[] = { 769static struct cpuidle_state avn_cstates[] = {
728 { 770 {
729 .name = "C1-AVN", 771 .name = "C1-AVN",
@@ -907,51 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev,
907 mwait_idle_with_hints(eax, ecx); 949 mwait_idle_with_hints(eax, ecx);
908} 950}
909 951
910static void __setup_broadcast_timer(void *arg) 952static void __setup_broadcast_timer(bool on)
911{ 953{
912 unsigned long on = (unsigned long)arg;
913
914 if (on) 954 if (on)
915 tick_broadcast_enable(); 955 tick_broadcast_enable();
916 else 956 else
917 tick_broadcast_disable(); 957 tick_broadcast_disable();
918} 958}
919 959
920static int cpu_hotplug_notify(struct notifier_block *n, 960static void auto_demotion_disable(void)
921 unsigned long action, void *hcpu)
922{
923 int hotcpu = (unsigned long)hcpu;
924 struct cpuidle_device *dev;
925
926 switch (action & ~CPU_TASKS_FROZEN) {
927 case CPU_ONLINE:
928
929 if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
930 smp_call_function_single(hotcpu, __setup_broadcast_timer,
931 (void *)true, 1);
932
933 /*
934 * Some systems can hotplug a cpu at runtime after
935 * the kernel has booted, we have to initialize the
936 * driver in this case
937 */
938 dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu);
939 if (dev->registered)
940 break;
941
942 if (intel_idle_cpu_init(hotcpu))
943 return NOTIFY_BAD;
944
945 break;
946 }
947 return NOTIFY_OK;
948}
949
950static struct notifier_block cpu_hotplug_notifier = {
951 .notifier_call = cpu_hotplug_notify,
952};
953
954static void auto_demotion_disable(void *dummy)
955{ 961{
956 unsigned long long msr_bits; 962 unsigned long long msr_bits;
957 963
@@ -959,7 +965,7 @@ static void auto_demotion_disable(void *dummy)
959 msr_bits &= ~(icpu->auto_demotion_disable_flags); 965 msr_bits &= ~(icpu->auto_demotion_disable_flags);
960 wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); 966 wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
961} 967}
962static void c1e_promotion_disable(void *dummy) 968static void c1e_promotion_disable(void)
963{ 969{
964 unsigned long long msr_bits; 970 unsigned long long msr_bits;
965 971
@@ -978,6 +984,10 @@ static const struct idle_cpu idle_cpu_atom = {
978 .state_table = atom_cstates, 984 .state_table = atom_cstates,
979}; 985};
980 986
987static const struct idle_cpu idle_cpu_tangier = {
988 .state_table = tangier_cstates,
989};
990
981static const struct idle_cpu idle_cpu_lincroft = { 991static const struct idle_cpu idle_cpu_lincroft = {
982 .state_table = atom_cstates, 992 .state_table = atom_cstates,
983 .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, 993 .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
@@ -1066,6 +1076,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
1066 ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), 1076 ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb),
1067 ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), 1077 ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom),
1068 ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), 1078 ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt),
1079 ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier),
1069 ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), 1080 ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht),
1070 ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), 1081 ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb),
1071 ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), 1082 ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt),
@@ -1084,6 +1095,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
1084 ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), 1095 ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl),
1085 ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), 1096 ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx),
1086 ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), 1097 ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl),
1098 ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl),
1087 ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), 1099 ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt),
1088 ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), 1100 ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv),
1089 {} 1101 {}
@@ -1373,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void)
1373 * allocate, initialize, register cpuidle_devices 1385 * allocate, initialize, register cpuidle_devices
1374 * @cpu: cpu/core to initialize 1386 * @cpu: cpu/core to initialize
1375 */ 1387 */
1376static int intel_idle_cpu_init(int cpu) 1388static int intel_idle_cpu_init(unsigned int cpu)
1377{ 1389{
1378 struct cpuidle_device *dev; 1390 struct cpuidle_device *dev;
1379 1391
1380 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); 1392 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
1381
1382 dev->cpu = cpu; 1393 dev->cpu = cpu;
1383 1394
1384 if (cpuidle_register_device(dev)) { 1395 if (cpuidle_register_device(dev)) {
@@ -1387,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu)
1387 } 1398 }
1388 1399
1389 if (icpu->auto_demotion_disable_flags) 1400 if (icpu->auto_demotion_disable_flags)
1390 smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); 1401 auto_demotion_disable();
1391 1402
1392 if (icpu->disable_promotion_to_c1e) 1403 if (icpu->disable_promotion_to_c1e)
1393 smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); 1404 c1e_promotion_disable();
1405
1406 return 0;
1407}
1408
1409static int intel_idle_cpu_online(unsigned int cpu)
1410{
1411 struct cpuidle_device *dev;
1412
1413 if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
1414 __setup_broadcast_timer(true);
1415
1416 /*
1417 * Some systems can hotplug a cpu at runtime after
1418 * the kernel has booted, so we have to initialize the
1419 * driver in this case
1420 */
1421 dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
1422 if (!dev->registered)
1423 return intel_idle_cpu_init(cpu);
1394 1424
1395 return 0; 1425 return 0;
1396} 1426}
1397 1427
1398static int __init intel_idle_init(void) 1428static int __init intel_idle_init(void)
1399{ 1429{
1400 int retval, i; 1430 int retval;
1401 1431
1402 /* Do not load intel_idle at all for now if idle= is passed */ 1432 /* Do not load intel_idle at all for now if idle= is passed */
1403 if (boot_option_idle_override != IDLE_NO_OVERRIDE) 1433 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
@@ -1417,35 +1447,29 @@ static int __init intel_idle_init(void)
1417 struct cpuidle_driver *drv = cpuidle_get_driver(); 1447 struct cpuidle_driver *drv = cpuidle_get_driver();
1418 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", 1448 printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
1419 drv ? drv->name : "none"); 1449 drv ? drv->name : "none");
1420 free_percpu(intel_idle_cpuidle_devices); 1450 goto init_driver_fail;
1421 return retval;
1422 } 1451 }
1423 1452
1424 cpu_notifier_register_begin();
1425
1426 for_each_online_cpu(i) {
1427 retval = intel_idle_cpu_init(i);
1428 if (retval) {
1429 intel_idle_cpuidle_devices_uninit();
1430 cpu_notifier_register_done();
1431 cpuidle_unregister_driver(&intel_idle_driver);
1432 free_percpu(intel_idle_cpuidle_devices);
1433 return retval;
1434 }
1435 }
1436 __register_cpu_notifier(&cpu_hotplug_notifier);
1437
1438 if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ 1453 if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
1439 lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; 1454 lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
1440 else
1441 on_each_cpu(__setup_broadcast_timer, (void *)true, 1);
1442 1455
1443 cpu_notifier_register_done(); 1456 retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
1457 intel_idle_cpu_online, NULL);
1458 if (retval < 0)
1459 goto hp_setup_fail;
1444 1460
1445 pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", 1461 pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
1446 lapic_timer_reliable_states); 1462 lapic_timer_reliable_states);
1447 1463
1448 return 0; 1464 return 0;
1465
1466hp_setup_fail:
1467 intel_idle_cpuidle_devices_uninit();
1468 cpuidle_unregister_driver(&intel_idle_driver);
1469init_driver_fail:
1470 free_percpu(intel_idle_cpuidle_devices);
1471 return retval;
1472
1449} 1473}
1450device_initcall(intel_idle_init); 1474device_initcall(intel_idle_init);
1451 1475
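For reference, the intel_idle hunks above drop the open-coded CPU hotplug notifier in favour of the hotplug state machine. A minimal sketch of that pattern, with placeholder names (my_cpu_online() and the state string are illustrative, not taken from the patch):

#include <linux/cpuhotplug.h>

static int my_cpu_online(unsigned int cpu)
{
        /* per-CPU setup that previously lived in the CPU_ONLINE notifier */
        return 0;
}

static int __init my_driver_init(void)
{
        int ret;

        /* NULL teardown callback: nothing to undo when a CPU goes offline */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/driver:online",
                                my_cpu_online, NULL);
        if (ret < 0)
                return ret;     /* callbacks were not installed */

        /* on success, ret is the dynamically allocated hotplug state */
        return 0;
}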
diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index c48fc0c4abd9..fa5ca0992be6 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2585,6 +2585,9 @@ static int smsc911x_suspend(struct device *dev)
2585 PMT_CTRL_PM_MODE_D1_ | PMT_CTRL_WOL_EN_ | 2585 PMT_CTRL_PM_MODE_D1_ | PMT_CTRL_WOL_EN_ |
2586 PMT_CTRL_ED_EN_ | PMT_CTRL_PME_EN_); 2586 PMT_CTRL_ED_EN_ | PMT_CTRL_PME_EN_);
2587 2587
2588 pm_runtime_disable(dev);
2589 pm_runtime_set_suspended(dev);
2590
2588 return 0; 2591 return 0;
2589} 2592}
2590 2593
@@ -2594,6 +2597,9 @@ static int smsc911x_resume(struct device *dev)
2594 struct smsc911x_data *pdata = netdev_priv(ndev); 2597 struct smsc911x_data *pdata = netdev_priv(ndev);
2595 unsigned int to = 100; 2598 unsigned int to = 100;
2596 2599
2600 pm_runtime_enable(dev);
2601 pm_runtime_resume(dev);
2602
2597 /* Note 3.11 from the datasheet: 2603 /* Note 3.11 from the datasheet:
2598 * "When the LAN9220 is in a power saving state, a write of any 2604 * "When the LAN9220 is in a power saving state, a write of any
2599 * data to the BYTE_TEST register will wake-up the device." 2605 * data to the BYTE_TEST register will wake-up the device."
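The smsc911x hunks bracket system sleep with runtime-PM state updates so the PM core's bookkeeping matches the hardware state across suspend/resume. A condensed sketch of that pattern (example_suspend/example_resume are illustrative names, not the driver's actual callbacks):

#include <linux/pm_runtime.h>

static int example_suspend(struct device *dev)
{
        /* ... put the device into its low-power state ... */
        pm_runtime_disable(dev);
        pm_runtime_set_suspended(dev);
        return 0;
}

static int example_resume(struct device *dev)
{
        pm_runtime_enable(dev);
        pm_runtime_resume(dev);
        /* ... bring the device back up ... */
        return 0;
}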
diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c
index 01b6d3f9b8fb..56bce1908be2 100644
--- a/drivers/power/avs/rockchip-io-domain.c
+++ b/drivers/power/avs/rockchip-io-domain.c
@@ -143,7 +143,7 @@ static int rockchip_iodomain_notify(struct notifier_block *nb,
143 if (ret && event == REGULATOR_EVENT_PRE_VOLTAGE_CHANGE) 143 if (ret && event == REGULATOR_EVENT_PRE_VOLTAGE_CHANGE)
144 return NOTIFY_BAD; 144 return NOTIFY_BAD;
145 145
146 dev_info(supply->iod->dev, "Setting to %d done\n", uV); 146 dev_dbg(supply->iod->dev, "Setting to %d done\n", uV);
147 return NOTIFY_OK; 147 return NOTIFY_OK;
148} 148}
149 149
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 243b233ff31b..9a25110c4a46 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -189,14 +189,13 @@ struct rapl_package {
189 unsigned int time_unit; 189 unsigned int time_unit;
190 struct rapl_domain *domains; /* array of domains, sized at runtime */ 190 struct rapl_domain *domains; /* array of domains, sized at runtime */
191 struct powercap_zone *power_zone; /* keep track of parent zone */ 191 struct powercap_zone *power_zone; /* keep track of parent zone */
192 int nr_cpus; /* active cpus on the package, topology info is lost during
193 * cpu hotplug. so we have to track ourselves.
194 */
195 unsigned long power_limit_irq; /* keep track of package power limit 192 unsigned long power_limit_irq; /* keep track of package power limit
196 * notify interrupt enable status. 193 * notify interrupt enable status.
197 */ 194 */
198 struct list_head plist; 195 struct list_head plist;
199 int lead_cpu; /* one active cpu per package for access */ 196 int lead_cpu; /* one active cpu per package for access */
197 /* Track active cpus */
198 struct cpumask cpumask;
200}; 199};
201 200
202struct rapl_defaults { 201struct rapl_defaults {
@@ -275,18 +274,6 @@ static struct rapl_package *find_package_by_id(int id)
275 return NULL; 274 return NULL;
276} 275}
277 276
278/* caller must hold cpu hotplug lock */
279static void rapl_cleanup_data(void)
280{
281 struct rapl_package *p, *tmp;
282
283 list_for_each_entry_safe(p, tmp, &rapl_packages, plist) {
284 kfree(p->domains);
285 list_del(&p->plist);
286 kfree(p);
287 }
288}
289
290static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw) 277static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
291{ 278{
292 struct rapl_domain *rd; 279 struct rapl_domain *rd;
@@ -442,6 +429,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid)
442 return i; 429 return i;
443 } 430 }
444 } 431 }
432 pr_err("Cannot find matching power limit for constraint %d\n", cid);
445 433
446 return -EINVAL; 434 return -EINVAL;
447} 435}
@@ -457,6 +445,10 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid,
457 get_online_cpus(); 445 get_online_cpus();
458 rd = power_zone_to_rapl_domain(power_zone); 446 rd = power_zone_to_rapl_domain(power_zone);
459 id = contraint_to_pl(rd, cid); 447 id = contraint_to_pl(rd, cid);
448 if (id < 0) {
449 ret = id;
450 goto set_exit;
451 }
460 452
461 rp = rd->rp; 453 rp = rd->rp;
462 454
@@ -496,6 +488,11 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
496 get_online_cpus(); 488 get_online_cpus();
497 rd = power_zone_to_rapl_domain(power_zone); 489 rd = power_zone_to_rapl_domain(power_zone);
498 id = contraint_to_pl(rd, cid); 490 id = contraint_to_pl(rd, cid);
491 if (id < 0) {
492 ret = id;
493 goto get_exit;
494 }
495
499 switch (rd->rpl[id].prim_id) { 496 switch (rd->rpl[id].prim_id) {
500 case PL1_ENABLE: 497 case PL1_ENABLE:
501 prim = POWER_LIMIT1; 498 prim = POWER_LIMIT1;
@@ -512,6 +509,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
512 else 509 else
513 *data = val; 510 *data = val;
514 511
512get_exit:
515 put_online_cpus(); 513 put_online_cpus();
516 514
517 return ret; 515 return ret;
@@ -527,6 +525,10 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
527 get_online_cpus(); 525 get_online_cpus();
528 rd = power_zone_to_rapl_domain(power_zone); 526 rd = power_zone_to_rapl_domain(power_zone);
529 id = contraint_to_pl(rd, cid); 527 id = contraint_to_pl(rd, cid);
528 if (id < 0) {
529 ret = id;
530 goto set_time_exit;
531 }
530 532
531 switch (rd->rpl[id].prim_id) { 533 switch (rd->rpl[id].prim_id) {
532 case PL1_ENABLE: 534 case PL1_ENABLE:
@@ -538,6 +540,8 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
538 default: 540 default:
539 ret = -EINVAL; 541 ret = -EINVAL;
540 } 542 }
543
544set_time_exit:
541 put_online_cpus(); 545 put_online_cpus();
542 return ret; 546 return ret;
543} 547}
@@ -552,6 +556,10 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
552 get_online_cpus(); 556 get_online_cpus();
553 rd = power_zone_to_rapl_domain(power_zone); 557 rd = power_zone_to_rapl_domain(power_zone);
554 id = contraint_to_pl(rd, cid); 558 id = contraint_to_pl(rd, cid);
559 if (id < 0) {
560 ret = id;
561 goto get_time_exit;
562 }
555 563
556 switch (rd->rpl[id].prim_id) { 564 switch (rd->rpl[id].prim_id) {
557 case PL1_ENABLE: 565 case PL1_ENABLE:
@@ -566,6 +574,8 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
566 } 574 }
567 if (!ret) 575 if (!ret)
568 *data = val; 576 *data = val;
577
578get_time_exit:
569 put_online_cpus(); 579 put_online_cpus();
570 580
571 return ret; 581 return ret;
@@ -707,7 +717,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
707 case ENERGY_UNIT: 717 case ENERGY_UNIT:
708 scale = ENERGY_UNIT_SCALE; 718 scale = ENERGY_UNIT_SCALE;
709 /* per domain unit takes precedence */ 719 /* per domain unit takes precedence */
710 if (rd && rd->domain_energy_unit) 720 if (rd->domain_energy_unit)
711 units = rd->domain_energy_unit; 721 units = rd->domain_energy_unit;
712 else 722 else
713 units = rp->energy_unit; 723 units = rp->energy_unit;
@@ -976,10 +986,20 @@ static void package_power_limit_irq_save(struct rapl_package *rp)
976 smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); 986 smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
977} 987}
978 988
979static void power_limit_irq_restore_cpu(void *info) 989/*
990 * Restore per package power limit interrupt enable state. Called from cpu
991 * hotplug code on package removal.
992 */
993static void package_power_limit_irq_restore(struct rapl_package *rp)
980{ 994{
981 u32 l, h = 0; 995 u32 l, h;
982 struct rapl_package *rp = (struct rapl_package *)info; 996
997 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
998 return;
999
1000 /* irq enable state not saved, nothing to restore */
1001 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
1002 return;
983 1003
984 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); 1004 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
985 1005
@@ -991,19 +1011,6 @@ static void power_limit_irq_restore_cpu(void *info)
991 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 1011 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
992} 1012}
993 1013
994/* restore per package power limit interrupt enable state */
995static void package_power_limit_irq_restore(struct rapl_package *rp)
996{
997 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
998 return;
999
1000 /* irq enable state not saved, nothing to restore */
1001 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
1002 return;
1003
1004 smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1);
1005}
1006
1007static void set_floor_freq_default(struct rapl_domain *rd, bool mode) 1014static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
1008{ 1015{
1009 int nr_powerlimit = find_nr_power_limit(rd); 1016 int nr_powerlimit = find_nr_power_limit(rd);
@@ -1160,84 +1167,49 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
1160 RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), 1167 RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core),
1161 1168
1162 RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), 1169 RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server),
1170 RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server),
1163 {} 1171 {}
1164}; 1172};
1165MODULE_DEVICE_TABLE(x86cpu, rapl_ids); 1173MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
1166 1174
1167/* read once for all raw primitive data for all packages, domains */ 1175/* Read once for all raw primitive data for domains */
1168static void rapl_update_domain_data(void) 1176static void rapl_update_domain_data(struct rapl_package *rp)
1169{ 1177{
1170 int dmn, prim; 1178 int dmn, prim;
1171 u64 val; 1179 u64 val;
1172 struct rapl_package *rp;
1173 1180
1174 list_for_each_entry(rp, &rapl_packages, plist) { 1181 for (dmn = 0; dmn < rp->nr_domains; dmn++) {
1175 for (dmn = 0; dmn < rp->nr_domains; dmn++) { 1182 pr_debug("update package %d domain %s data\n", rp->id,
1176 pr_debug("update package %d domain %s data\n", rp->id, 1183 rp->domains[dmn].name);
1177 rp->domains[dmn].name); 1184 /* exclude non-raw primitives */
1178 /* exclude non-raw primitives */ 1185 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
1179 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) 1186 if (!rapl_read_data_raw(&rp->domains[dmn], prim,
1180 if (!rapl_read_data_raw(&rp->domains[dmn], prim, 1187 rpi[prim].unit, &val))
1181 rpi[prim].unit, 1188 rp->domains[dmn].rdd.primitives[prim] = val;
1182 &val))
1183 rp->domains[dmn].rdd.primitives[prim] =
1184 val;
1185 } 1189 }
1186 } 1190 }
1187 1191
1188} 1192}
1189 1193
1190static int rapl_unregister_powercap(void) 1194static void rapl_unregister_powercap(void)
1191{ 1195{
1192 struct rapl_package *rp;
1193 struct rapl_domain *rd, *rd_package = NULL;
1194
1195 /* unregister all active rapl packages from the powercap layer,
1196 * hotplug lock held
1197 */
1198 list_for_each_entry(rp, &rapl_packages, plist) {
1199 package_power_limit_irq_restore(rp);
1200
1201 for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
1202 rd++) {
1203 pr_debug("remove package, undo power limit on %d: %s\n",
1204 rp->id, rd->name);
1205 rapl_write_data_raw(rd, PL1_ENABLE, 0);
1206 rapl_write_data_raw(rd, PL1_CLAMP, 0);
1207 if (find_nr_power_limit(rd) > 1) {
1208 rapl_write_data_raw(rd, PL2_ENABLE, 0);
1209 rapl_write_data_raw(rd, PL2_CLAMP, 0);
1210 }
1211 if (rd->id == RAPL_DOMAIN_PACKAGE) {
1212 rd_package = rd;
1213 continue;
1214 }
1215 powercap_unregister_zone(control_type, &rd->power_zone);
1216 }
1217 /* do the package zone last */
1218 if (rd_package)
1219 powercap_unregister_zone(control_type,
1220 &rd_package->power_zone);
1221 }
1222
1223 if (platform_rapl_domain) { 1196 if (platform_rapl_domain) {
1224 powercap_unregister_zone(control_type, 1197 powercap_unregister_zone(control_type,
1225 &platform_rapl_domain->power_zone); 1198 &platform_rapl_domain->power_zone);
1226 kfree(platform_rapl_domain); 1199 kfree(platform_rapl_domain);
1227 } 1200 }
1228
1229 powercap_unregister_control_type(control_type); 1201 powercap_unregister_control_type(control_type);
1230
1231 return 0;
1232} 1202}
1233 1203
1234static int rapl_package_register_powercap(struct rapl_package *rp) 1204static int rapl_package_register_powercap(struct rapl_package *rp)
1235{ 1205{
1236 struct rapl_domain *rd; 1206 struct rapl_domain *rd;
1237 int ret = 0;
1238 char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/ 1207 char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/
1239 struct powercap_zone *power_zone = NULL; 1208 struct powercap_zone *power_zone = NULL;
1240 int nr_pl; 1209 int nr_pl, ret;
1210
1211 /* Update the domain data of the new package */
1212 rapl_update_domain_data(rp);
1241 1213
1242 /* first we register package domain as the parent zone*/ 1214 /* first we register package domain as the parent zone*/
1243 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1215 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
@@ -1257,8 +1229,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
1257 if (IS_ERR(power_zone)) { 1229 if (IS_ERR(power_zone)) {
1258 pr_debug("failed to register package, %d\n", 1230 pr_debug("failed to register package, %d\n",
1259 rp->id); 1231 rp->id);
1260 ret = PTR_ERR(power_zone); 1232 return PTR_ERR(power_zone);
1261 goto exit_package;
1262 } 1233 }
1263 /* track parent zone in per package/socket data */ 1234 /* track parent zone in per package/socket data */
1264 rp->power_zone = power_zone; 1235 rp->power_zone = power_zone;
@@ -1268,8 +1239,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
1268 } 1239 }
1269 if (!power_zone) { 1240 if (!power_zone) {
1270 pr_err("no package domain found, unknown topology!\n"); 1241 pr_err("no package domain found, unknown topology!\n");
1271 ret = -ENODEV; 1242 return -ENODEV;
1272 goto exit_package;
1273 } 1243 }
1274 /* now register domains as children of the socket/package*/ 1244 /* now register domains as children of the socket/package*/
1275 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1245 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
@@ -1290,11 +1260,11 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
1290 goto err_cleanup; 1260 goto err_cleanup;
1291 } 1261 }
1292 } 1262 }
1263 return 0;
1293 1264
1294exit_package:
1295 return ret;
1296err_cleanup: 1265err_cleanup:
1297 /* clean up previously initialized domains within the package if we 1266 /*
1267 * Clean up previously initialized domains within the package if we
1298 * failed after the first domain setup. 1268 * failed after the first domain setup.
1299 */ 1269 */
1300 while (--rd >= rp->domains) { 1270 while (--rd >= rp->domains) {
@@ -1305,7 +1275,7 @@ err_cleanup:
1305 return ret; 1275 return ret;
1306} 1276}
1307 1277
1308static int rapl_register_psys(void) 1278static int __init rapl_register_psys(void)
1309{ 1279{
1310 struct rapl_domain *rd; 1280 struct rapl_domain *rd;
1311 struct powercap_zone *power_zone; 1281 struct powercap_zone *power_zone;
@@ -1346,40 +1316,14 @@ static int rapl_register_psys(void)
1346 return 0; 1316 return 0;
1347} 1317}
1348 1318
1349static int rapl_register_powercap(void) 1319static int __init rapl_register_powercap(void)
1350{ 1320{
1351 struct rapl_domain *rd;
1352 struct rapl_package *rp;
1353 int ret = 0;
1354
1355 control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); 1321 control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
1356 if (IS_ERR(control_type)) { 1322 if (IS_ERR(control_type)) {
1357 pr_debug("failed to register powercap control_type.\n"); 1323 pr_debug("failed to register powercap control_type.\n");
1358 return PTR_ERR(control_type); 1324 return PTR_ERR(control_type);
1359 } 1325 }
1360 /* read the initial data */ 1326 return 0;
1361 rapl_update_domain_data();
1362 list_for_each_entry(rp, &rapl_packages, plist)
1363 if (rapl_package_register_powercap(rp))
1364 goto err_cleanup_package;
1365
1366 /* Don't bail out if PSys is not supported */
1367 rapl_register_psys();
1368
1369 return ret;
1370
1371err_cleanup_package:
1372 /* clean up previously initialized packages */
1373 list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) {
1374 for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
1375 rd++) {
1376 pr_debug("unregister zone/package %d, %s domain\n",
1377 rp->id, rd->name);
1378 powercap_unregister_zone(control_type, &rd->power_zone);
1379 }
1380 }
1381
1382 return ret;
1383} 1327}
1384 1328
1385static int rapl_check_domain(int cpu, int domain) 1329static int rapl_check_domain(int cpu, int domain)
@@ -1452,9 +1396,8 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd)
1452 */ 1396 */
1453static int rapl_detect_domains(struct rapl_package *rp, int cpu) 1397static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1454{ 1398{
1455 int i;
1456 int ret = 0;
1457 struct rapl_domain *rd; 1399 struct rapl_domain *rd;
1400 int i;
1458 1401
1459 for (i = 0; i < RAPL_DOMAIN_MAX; i++) { 1402 for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1460 /* use physical package id to read counters */ 1403 /* use physical package id to read counters */
@@ -1466,84 +1409,20 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1466 rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); 1409 rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1467 if (!rp->nr_domains) { 1410 if (!rp->nr_domains) {
1468 pr_debug("no valid rapl domains found in package %d\n", rp->id); 1411 pr_debug("no valid rapl domains found in package %d\n", rp->id);
1469 ret = -ENODEV; 1412 return -ENODEV;
1470 goto done;
1471 } 1413 }
1472 pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id); 1414 pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id);
1473 1415
1474 rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), 1416 rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1475 GFP_KERNEL); 1417 GFP_KERNEL);
1476 if (!rp->domains) { 1418 if (!rp->domains)
1477 ret = -ENOMEM; 1419 return -ENOMEM;
1478 goto done; 1420
1479 }
1480 rapl_init_domains(rp); 1421 rapl_init_domains(rp);
1481 1422
1482 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) 1423 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
1483 rapl_detect_powerlimit(rd); 1424 rapl_detect_powerlimit(rd);
1484 1425
1485
1486
1487done:
1488 return ret;
1489}
1490
1491static bool is_package_new(int package)
1492{
1493 struct rapl_package *rp;
1494
1495 /* caller prevents cpu hotplug, there will be no new packages added
1496 * or deleted while traversing the package list, no need for locking.
1497 */
1498 list_for_each_entry(rp, &rapl_packages, plist)
1499 if (package == rp->id)
1500 return false;
1501
1502 return true;
1503}
1504
1505/* RAPL interface can be made of a two-level hierarchy: package level and domain
1506 * level. We first detect the number of packages then domains of each package.
1507 * We have to consider the possiblity of CPU online/offline due to hotplug and
1508 * other scenarios.
1509 */
1510static int rapl_detect_topology(void)
1511{
1512 int i;
1513 int phy_package_id;
1514 struct rapl_package *new_package, *rp;
1515
1516 for_each_online_cpu(i) {
1517 phy_package_id = topology_physical_package_id(i);
1518 if (is_package_new(phy_package_id)) {
1519 new_package = kzalloc(sizeof(*rp), GFP_KERNEL);
1520 if (!new_package) {
1521 rapl_cleanup_data();
1522 return -ENOMEM;
1523 }
1524 /* add the new package to the list */
1525 new_package->id = phy_package_id;
1526 new_package->nr_cpus = 1;
1527 /* use the first active cpu of the package to access */
1528 new_package->lead_cpu = i;
1529 /* check if the package contains valid domains */
1530 if (rapl_detect_domains(new_package, i) ||
1531 rapl_defaults->check_unit(new_package, i)) {
1532 kfree(new_package->domains);
1533 kfree(new_package);
1534 /* free up the packages already initialized */
1535 rapl_cleanup_data();
1536 return -ENODEV;
1537 }
1538 INIT_LIST_HEAD(&new_package->plist);
1539 list_add(&new_package->plist, &rapl_packages);
1540 } else {
1541 rp = find_package_by_id(phy_package_id);
1542 if (rp)
1543 ++rp->nr_cpus;
1544 }
1545 }
1546
1547 return 0; 1426 return 0;
1548} 1427}
1549 1428
@@ -1552,12 +1431,21 @@ static void rapl_remove_package(struct rapl_package *rp)
1552{ 1431{
1553 struct rapl_domain *rd, *rd_package = NULL; 1432 struct rapl_domain *rd, *rd_package = NULL;
1554 1433
1434 package_power_limit_irq_restore(rp);
1435
1555 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { 1436 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1437 rapl_write_data_raw(rd, PL1_ENABLE, 0);
1438 rapl_write_data_raw(rd, PL1_CLAMP, 0);
1439 if (find_nr_power_limit(rd) > 1) {
1440 rapl_write_data_raw(rd, PL2_ENABLE, 0);
1441 rapl_write_data_raw(rd, PL2_CLAMP, 0);
1442 }
1556 if (rd->id == RAPL_DOMAIN_PACKAGE) { 1443 if (rd->id == RAPL_DOMAIN_PACKAGE) {
1557 rd_package = rd; 1444 rd_package = rd;
1558 continue; 1445 continue;
1559 } 1446 }
1560 pr_debug("remove package %d, %s domain\n", rp->id, rd->name); 1447 pr_debug("remove package, undo power limit on %d: %s\n",
1448 rp->id, rd->name);
1561 powercap_unregister_zone(control_type, &rd->power_zone); 1449 powercap_unregister_zone(control_type, &rd->power_zone);
1562 } 1450 }
1563 /* do parent zone last */ 1451 /* do parent zone last */
@@ -1567,20 +1455,17 @@ static void rapl_remove_package(struct rapl_package *rp)
1567} 1455}
1568 1456
1569/* called from CPU hotplug notifier, hotplug lock held */ 1457/* called from CPU hotplug notifier, hotplug lock held */
1570static int rapl_add_package(int cpu) 1458static struct rapl_package *rapl_add_package(int cpu, int pkgid)
1571{ 1459{
1572 int ret = 0;
1573 int phy_package_id;
1574 struct rapl_package *rp; 1460 struct rapl_package *rp;
1461 int ret;
1575 1462
1576 phy_package_id = topology_physical_package_id(cpu);
1577 rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); 1463 rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1578 if (!rp) 1464 if (!rp)
1579 return -ENOMEM; 1465 return ERR_PTR(-ENOMEM);
1580 1466
1581 /* add the new package to the list */ 1467 /* add the new package to the list */
1582 rp->id = phy_package_id; 1468 rp->id = pkgid;
1583 rp->nr_cpus = 1;
1584 rp->lead_cpu = cpu; 1469 rp->lead_cpu = cpu;
1585 1470
1586 /* check if the package contains valid domains */ 1471 /* check if the package contains valid domains */
@@ -1589,17 +1474,17 @@ static int rapl_add_package(int cpu)
1589 ret = -ENODEV; 1474 ret = -ENODEV;
1590 goto err_free_package; 1475 goto err_free_package;
1591 } 1476 }
1592 if (!rapl_package_register_powercap(rp)) { 1477 ret = rapl_package_register_powercap(rp);
1478 if (!ret) {
1593 INIT_LIST_HEAD(&rp->plist); 1479 INIT_LIST_HEAD(&rp->plist);
1594 list_add(&rp->plist, &rapl_packages); 1480 list_add(&rp->plist, &rapl_packages);
1595 return ret; 1481 return rp;
1596 } 1482 }
1597 1483
1598err_free_package: 1484err_free_package:
1599 kfree(rp->domains); 1485 kfree(rp->domains);
1600 kfree(rp); 1486 kfree(rp);
1601 1487 return ERR_PTR(ret);
1602 return ret;
1603} 1488}
1604 1489
1605/* Handles CPU hotplug on multi-socket systems. 1490/* Handles CPU hotplug on multi-socket systems.
@@ -1609,55 +1494,46 @@ err_free_package:
1609 * associated domains. Cooling devices are handled accordingly at 1494 * associated domains. Cooling devices are handled accordingly at
1610 * per-domain level. 1495 * per-domain level.
1611 */ 1496 */
1612static int rapl_cpu_callback(struct notifier_block *nfb, 1497static int rapl_cpu_online(unsigned int cpu)
1613 unsigned long action, void *hcpu)
1614{ 1498{
1615 unsigned long cpu = (unsigned long)hcpu; 1499 int pkgid = topology_physical_package_id(cpu);
1616 int phy_package_id;
1617 struct rapl_package *rp; 1500 struct rapl_package *rp;
1618 int lead_cpu;
1619 1501
1620 phy_package_id = topology_physical_package_id(cpu); 1502 rp = find_package_by_id(pkgid);
1621 switch (action) { 1503 if (!rp) {
1622 case CPU_ONLINE: 1504 rp = rapl_add_package(cpu, pkgid);
1623 case CPU_ONLINE_FROZEN: 1505 if (IS_ERR(rp))
1624 case CPU_DOWN_FAILED: 1506 return PTR_ERR(rp);
1625 case CPU_DOWN_FAILED_FROZEN:
1626 rp = find_package_by_id(phy_package_id);
1627 if (rp)
1628 ++rp->nr_cpus;
1629 else
1630 rapl_add_package(cpu);
1631 break;
1632 case CPU_DOWN_PREPARE:
1633 case CPU_DOWN_PREPARE_FROZEN:
1634 rp = find_package_by_id(phy_package_id);
1635 if (!rp)
1636 break;
1637 if (--rp->nr_cpus == 0)
1638 rapl_remove_package(rp);
1639 else if (cpu == rp->lead_cpu) {
1640 /* choose another active cpu in the package */
1641 lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu);
1642 if (lead_cpu < nr_cpu_ids)
1643 rp->lead_cpu = lead_cpu;
1644 else /* should never go here */
1645 pr_err("no active cpu available for package %d\n",
1646 phy_package_id);
1647 }
1648 } 1507 }
1508 cpumask_set_cpu(cpu, &rp->cpumask);
1509 return 0;
1510}
1511
1512static int rapl_cpu_down_prep(unsigned int cpu)
1513{
1514 int pkgid = topology_physical_package_id(cpu);
1515 struct rapl_package *rp;
1516 int lead_cpu;
1517
1518 rp = find_package_by_id(pkgid);
1519 if (!rp)
1520 return 0;
1649 1521
1650 return NOTIFY_OK; 1522 cpumask_clear_cpu(cpu, &rp->cpumask);
1523 lead_cpu = cpumask_first(&rp->cpumask);
1524 if (lead_cpu >= nr_cpu_ids)
1525 rapl_remove_package(rp);
1526 else if (rp->lead_cpu == cpu)
1527 rp->lead_cpu = lead_cpu;
1528 return 0;
1651} 1529}
1652 1530
1653static struct notifier_block rapl_cpu_notifier = { 1531static enum cpuhp_state pcap_rapl_online;
1654 .notifier_call = rapl_cpu_callback,
1655};
1656 1532
1657static int __init rapl_init(void) 1533static int __init rapl_init(void)
1658{ 1534{
1659 int ret = 0;
1660 const struct x86_cpu_id *id; 1535 const struct x86_cpu_id *id;
1536 int ret;
1661 1537
1662 id = x86_match_cpu(rapl_ids); 1538 id = x86_match_cpu(rapl_ids);
1663 if (!id) { 1539 if (!id) {
@@ -1669,36 +1545,29 @@ static int __init rapl_init(void)
1669 1545
1670 rapl_defaults = (struct rapl_defaults *)id->driver_data; 1546 rapl_defaults = (struct rapl_defaults *)id->driver_data;
1671 1547
1672 cpu_notifier_register_begin(); 1548 ret = rapl_register_powercap();
1673
1674 /* prevent CPU hotplug during detection */
1675 get_online_cpus();
1676 ret = rapl_detect_topology();
1677 if (ret) 1549 if (ret)
1678 goto done; 1550 return ret;
1679 1551
1680 if (rapl_register_powercap()) { 1552 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
1681 rapl_cleanup_data(); 1553 rapl_cpu_online, rapl_cpu_down_prep);
1682 ret = -ENODEV; 1554 if (ret < 0)
1683 goto done; 1555 goto err_unreg;
1684 } 1556 pcap_rapl_online = ret;
1685 __register_hotcpu_notifier(&rapl_cpu_notifier); 1557
1686done: 1558 /* Don't bail out if PSys is not supported */
1687 put_online_cpus(); 1559 rapl_register_psys();
1688 cpu_notifier_register_done(); 1560 return 0;
1689 1561
1562err_unreg:
1563 rapl_unregister_powercap();
1690 return ret; 1564 return ret;
1691} 1565}
1692 1566
1693static void __exit rapl_exit(void) 1567static void __exit rapl_exit(void)
1694{ 1568{
1695 cpu_notifier_register_begin(); 1569 cpuhp_remove_state(pcap_rapl_online);
1696 get_online_cpus();
1697 __unregister_hotcpu_notifier(&rapl_cpu_notifier);
1698 rapl_unregister_powercap(); 1570 rapl_unregister_powercap();
1699 rapl_cleanup_data();
1700 put_online_cpus();
1701 cpu_notifier_register_done();
1702} 1571}
1703 1572
1704module_init(rapl_init); 1573module_init(rapl_init);
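Taken together, the intel_rapl hunks replace the CPU notifier with a pair of hotplug callbacks whose dynamic state is saved for removal at module exit. A condensed, assumption-level sketch of that lifecycle (example_* names are placeholders):

#include <linux/cpuhotplug.h>

static enum cpuhp_state example_online_state;

static int example_cpu_online(unsigned int cpu)
{
        /* add the CPU to its package's cpumask, creating the package if new */
        return 0;
}

static int example_cpu_down_prep(unsigned int cpu)
{
        /* drop the CPU; hand the lead role over or remove the package */
        return 0;
}

static int __init example_init(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/rapl:online",
                                example_cpu_online, example_cpu_down_prep);
        if (ret < 0)
                return ret;
        example_online_state = ret;
        return 0;
}

static void __exit example_exit(void)
{
        /* runs the down-prepare callback on all online CPUs */
        cpuhp_remove_state(example_online_state);
}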
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 350cb5e22ff3..df64692e9e64 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -43,7 +43,6 @@
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/delay.h> 44#include <linux/delay.h>
45#include <linux/kthread.h> 45#include <linux/kthread.h>
46#include <linux/freezer.h>
47#include <linux/cpu.h> 46#include <linux/cpu.h>
48#include <linux/thermal.h> 47#include <linux/thermal.h>
49#include <linux/slab.h> 48#include <linux/slab.h>
@@ -85,11 +84,26 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update
85 */ 84 */
86static bool clamping; 85static bool clamping;
87 86
87static const struct sched_param sparam = {
88 .sched_priority = MAX_USER_RT_PRIO / 2,
89};
90struct powerclamp_worker_data {
91 struct kthread_worker *worker;
92 struct kthread_work balancing_work;
93 struct kthread_delayed_work idle_injection_work;
94 unsigned int cpu;
95 unsigned int count;
96 unsigned int guard;
97 unsigned int window_size_now;
98 unsigned int target_ratio;
99 unsigned int duration_jiffies;
100 bool clamping;
101};
88 102
89static struct task_struct * __percpu *powerclamp_thread; 103static struct powerclamp_worker_data * __percpu worker_data;
90static struct thermal_cooling_device *cooling_dev; 104static struct thermal_cooling_device *cooling_dev;
91static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu 105static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu
92 * clamping thread 106 * clamping kthread worker
93 */ 107 */
94 108
95static unsigned int duration; 109static unsigned int duration;
@@ -261,11 +275,6 @@ static u64 pkg_state_counter(void)
261 return count; 275 return count;
262} 276}
263 277
264static void noop_timer(unsigned long foo)
265{
266 /* empty... just the fact that we get the interrupt wakes us up */
267}
268
269static unsigned int get_compensation(int ratio) 278static unsigned int get_compensation(int ratio)
270{ 279{
271 unsigned int comp = 0; 280 unsigned int comp = 0;
@@ -367,103 +376,79 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
367 return set_target_ratio + guard <= current_ratio; 376 return set_target_ratio + guard <= current_ratio;
368} 377}
369 378
370static int clamp_thread(void *arg) 379static void clamp_balancing_func(struct kthread_work *work)
371{ 380{
372 int cpunr = (unsigned long)arg; 381 struct powerclamp_worker_data *w_data;
373 DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); 382 int sleeptime;
374 static const struct sched_param param = { 383 unsigned long target_jiffies;
375 .sched_priority = MAX_USER_RT_PRIO/2, 384 unsigned int compensated_ratio;
376 }; 385 int interval; /* jiffies to sleep for each attempt */
377 unsigned int count = 0;
378 unsigned int target_ratio;
379 386
380 set_bit(cpunr, cpu_clamping_mask); 387 w_data = container_of(work, struct powerclamp_worker_data,
381 set_freezable(); 388 balancing_work);
382 init_timer_on_stack(&wakeup_timer);
383 sched_setscheduler(current, SCHED_FIFO, &param);
384
385 while (true == clamping && !kthread_should_stop() &&
386 cpu_online(cpunr)) {
387 int sleeptime;
388 unsigned long target_jiffies;
389 unsigned int guard;
390 unsigned int compensated_ratio;
391 int interval; /* jiffies to sleep for each attempt */
392 unsigned int duration_jiffies = msecs_to_jiffies(duration);
393 unsigned int window_size_now;
394
395 try_to_freeze();
396 /*
397 * make sure user selected ratio does not take effect until
398 * the next round. adjust target_ratio if user has changed
399 * target such that we can converge quickly.
400 */
401 target_ratio = set_target_ratio;
402 guard = 1 + target_ratio/20;
403 window_size_now = window_size;
404 count++;
405
406 /*
407 * systems may have different ability to enter package level
408 * c-states, thus we need to compensate the injected idle ratio
409 * to achieve the actual target reported by the HW.
410 */
411 compensated_ratio = target_ratio +
412 get_compensation(target_ratio);
413 if (compensated_ratio <= 0)
414 compensated_ratio = 1;
415 interval = duration_jiffies * 100 / compensated_ratio;
416
417 /* align idle time */
418 target_jiffies = roundup(jiffies, interval);
419 sleeptime = target_jiffies - jiffies;
420 if (sleeptime <= 0)
421 sleeptime = 1;
422 schedule_timeout_interruptible(sleeptime);
423 /*
424 * only elected controlling cpu can collect stats and update
425 * control parameters.
426 */
427 if (cpunr == control_cpu && !(count%window_size_now)) {
428 should_skip =
429 powerclamp_adjust_controls(target_ratio,
430 guard, window_size_now);
431 smp_mb();
432 }
433 389
434 if (should_skip) 390 /*
435 continue; 391 * make sure user selected ratio does not take effect until
436 392 * the next round. adjust target_ratio if user has changed
437 target_jiffies = jiffies + duration_jiffies; 393 * target such that we can converge quickly.
438 mod_timer(&wakeup_timer, target_jiffies); 394 */
439 if (unlikely(local_softirq_pending())) 395 w_data->target_ratio = READ_ONCE(set_target_ratio);
440 continue; 396 w_data->guard = 1 + w_data->target_ratio / 20;
441 /* 397 w_data->window_size_now = window_size;
442 * stop tick sched during idle time, interrupts are still 398 w_data->duration_jiffies = msecs_to_jiffies(duration);
443 * allowed. thus jiffies are updated properly. 399 w_data->count++;
444 */ 400
445 preempt_disable(); 401 /*
446 /* mwait until target jiffies is reached */ 402 * systems may have different ability to enter package level
447 while (time_before(jiffies, target_jiffies)) { 403 * c-states, thus we need to compensate the injected idle ratio
448 unsigned long ecx = 1; 404 * to achieve the actual target reported by the HW.
449 unsigned long eax = target_mwait; 405 */
450 406 compensated_ratio = w_data->target_ratio +
451 /* 407 get_compensation(w_data->target_ratio);
452 * REVISIT: may call enter_idle() to notify drivers who 408 if (compensated_ratio <= 0)
453 * can save power during cpu idle. same for exit_idle() 409 compensated_ratio = 1;
454 */ 410 interval = w_data->duration_jiffies * 100 / compensated_ratio;
455 local_touch_nmi(); 411
456 stop_critical_timings(); 412 /* align idle time */
457 mwait_idle_with_hints(eax, ecx); 413 target_jiffies = roundup(jiffies, interval);
458 start_critical_timings(); 414 sleeptime = target_jiffies - jiffies;
459 atomic_inc(&idle_wakeup_counter); 415 if (sleeptime <= 0)
460 } 416 sleeptime = 1;
461 preempt_enable(); 417
418 if (clamping && w_data->clamping && cpu_online(w_data->cpu))
419 kthread_queue_delayed_work(w_data->worker,
420 &w_data->idle_injection_work,
421 sleeptime);
422}
423
424static void clamp_idle_injection_func(struct kthread_work *work)
425{
426 struct powerclamp_worker_data *w_data;
427
428 w_data = container_of(work, struct powerclamp_worker_data,
429 idle_injection_work.work);
430
431 /*
432 * only elected controlling cpu can collect stats and update
433 * control parameters.
434 */
435 if (w_data->cpu == control_cpu &&
436 !(w_data->count % w_data->window_size_now)) {
437 should_skip =
438 powerclamp_adjust_controls(w_data->target_ratio,
439 w_data->guard,
440 w_data->window_size_now);
441 smp_mb();
462 } 442 }
463 del_timer_sync(&wakeup_timer);
464 clear_bit(cpunr, cpu_clamping_mask);
465 443
466 return 0; 444 if (should_skip)
445 goto balance;
446
447 play_idle(jiffies_to_msecs(w_data->duration_jiffies));
448
449balance:
450 if (clamping && w_data->clamping && cpu_online(w_data->cpu))
451 kthread_queue_work(w_data->worker, &w_data->balancing_work);
467} 452}
468 453
469/* 454/*
@@ -507,10 +492,60 @@ static void poll_pkg_cstate(struct work_struct *dummy)
507 schedule_delayed_work(&poll_pkg_cstate_work, HZ); 492 schedule_delayed_work(&poll_pkg_cstate_work, HZ);
508} 493}
509 494
495static void start_power_clamp_worker(unsigned long cpu)
496{
497 struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
498 struct kthread_worker *worker;
499
500 worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
501 if (IS_ERR(worker))
502 return;
503
504 w_data->worker = worker;
505 w_data->count = 0;
506 w_data->cpu = cpu;
507 w_data->clamping = true;
508 set_bit(cpu, cpu_clamping_mask);
509 sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
510 kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
511 kthread_init_delayed_work(&w_data->idle_injection_work,
512 clamp_idle_injection_func);
513 kthread_queue_work(w_data->worker, &w_data->balancing_work);
514}
515
516static void stop_power_clamp_worker(unsigned long cpu)
517{
518 struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
519
520 if (!w_data->worker)
521 return;
522
523 w_data->clamping = false;
524 /*
525 * Make sure that all works that get queued after this point see
526 * the clamping disabled. The counter part is not needed because
527 * there is an implicit memory barrier when the queued work
528 * is proceed.
529 */
530 smp_wmb();
531 kthread_cancel_work_sync(&w_data->balancing_work);
532 kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
533 /*
534 * The balancing work still might be queued here because
535 * the handling of the "clapming" variable, cancel, and queue
536 * operations are not synchronized via a lock. But it is not
537 * a big deal. The balancing work is fast and destroy kthread
538 * will wait for it.
539 */
540 clear_bit(w_data->cpu, cpu_clamping_mask);
541 kthread_destroy_worker(w_data->worker);
542
543 w_data->worker = NULL;
544}
545
510static int start_power_clamp(void) 546static int start_power_clamp(void)
511{ 547{
512 unsigned long cpu; 548 unsigned long cpu;
513 struct task_struct *thread;
514 549
515 set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); 550 set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
516 /* prevent cpu hotplug */ 551 /* prevent cpu hotplug */
@@ -524,22 +559,9 @@ static int start_power_clamp(void)
524 clamping = true; 559 clamping = true;
525 schedule_delayed_work(&poll_pkg_cstate_work, 0); 560 schedule_delayed_work(&poll_pkg_cstate_work, 0);
526 561
527 /* start one thread per online cpu */ 562 /* start one kthread worker per online cpu */
528 for_each_online_cpu(cpu) { 563 for_each_online_cpu(cpu) {
529 struct task_struct **p = 564 start_power_clamp_worker(cpu);
530 per_cpu_ptr(powerclamp_thread, cpu);
531
532 thread = kthread_create_on_node(clamp_thread,
533 (void *) cpu,
534 cpu_to_node(cpu),
535 "kidle_inject/%ld", cpu);
536 /* bind to cpu here */
537 if (likely(!IS_ERR(thread))) {
538 kthread_bind(thread, cpu);
539 wake_up_process(thread);
540 *p = thread;
541 }
542
543 } 565 }
544 put_online_cpus(); 566 put_online_cpus();
545 567
@@ -549,71 +571,49 @@ static int start_power_clamp(void)
549static void end_power_clamp(void) 571static void end_power_clamp(void)
550{ 572{
551 int i; 573 int i;
552 struct task_struct *thread;
553 574
554 clamping = false;
555 /* 575 /*
556 * make clamping visible to other cpus and give per cpu clamping threads 576 * Block requeuing in all the kthread workers. They will flush and
557 * sometime to exit, or gets killed later. 577 * stop faster.
558 */ 578 */
559 smp_mb(); 579 clamping = false;
560 msleep(20);
561 if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { 580 if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
562 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { 581 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
563 pr_debug("clamping thread for cpu %d alive, kill\n", i); 582 pr_debug("clamping worker for cpu %d alive, destroy\n",
564 thread = *per_cpu_ptr(powerclamp_thread, i); 583 i);
565 kthread_stop(thread); 584 stop_power_clamp_worker(i);
566 } 585 }
567 } 586 }
568} 587}
569 588
570static int powerclamp_cpu_callback(struct notifier_block *nfb, 589static int powerclamp_cpu_online(unsigned int cpu)
571 unsigned long action, void *hcpu)
572{ 590{
573 unsigned long cpu = (unsigned long)hcpu; 591 if (clamping == false)
574 struct task_struct *thread; 592 return 0;
575 struct task_struct **percpu_thread = 593 start_power_clamp_worker(cpu);
576 per_cpu_ptr(powerclamp_thread, cpu); 594 /* prefer BSP as controlling CPU */
577 595 if (cpu == 0) {
578 if (false == clamping) 596 control_cpu = 0;
579 goto exit_ok; 597 smp_mb();
580
581 switch (action) {
582 case CPU_ONLINE:
583 thread = kthread_create_on_node(clamp_thread,
584 (void *) cpu,
585 cpu_to_node(cpu),
586 "kidle_inject/%lu", cpu);
587 if (likely(!IS_ERR(thread))) {
588 kthread_bind(thread, cpu);
589 wake_up_process(thread);
590 *percpu_thread = thread;
591 }
592 /* prefer BSP as controlling CPU */
593 if (cpu == 0) {
594 control_cpu = 0;
595 smp_mb();
596 }
597 break;
598 case CPU_DEAD:
599 if (test_bit(cpu, cpu_clamping_mask)) {
600 pr_err("cpu %lu dead but powerclamping thread is not\n",
601 cpu);
602 kthread_stop(*percpu_thread);
603 }
604 if (cpu == control_cpu) {
605 control_cpu = smp_processor_id();
606 smp_mb();
607 }
608 } 598 }
609 599 return 0;
610exit_ok:
611 return NOTIFY_OK;
612} 600}
613 601
614static struct notifier_block powerclamp_cpu_notifier = { 602static int powerclamp_cpu_predown(unsigned int cpu)
615 .notifier_call = powerclamp_cpu_callback, 603{
616}; 604 if (clamping == false)
605 return 0;
606
607 stop_power_clamp_worker(cpu);
608 if (cpu != control_cpu)
609 return 0;
610
611 control_cpu = cpumask_first(cpu_online_mask);
612 if (control_cpu == cpu)
613 control_cpu = cpumask_next(cpu, cpu_online_mask);
614 smp_mb();
615 return 0;
616}
617 617
618static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, 618static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
619 unsigned long *state) 619 unsigned long *state)
@@ -741,6 +741,8 @@ file_error:
741 debugfs_remove_recursive(debug_dir); 741 debugfs_remove_recursive(debug_dir);
742} 742}
743 743
744static enum cpuhp_state hp_state;
745
744static int __init powerclamp_init(void) 746static int __init powerclamp_init(void)
745{ 747{
746 int retval; 748 int retval;
@@ -758,10 +760,17 @@ static int __init powerclamp_init(void)
758 760
759 /* set default limit, maybe adjusted during runtime based on feedback */ 761 /* set default limit, maybe adjusted during runtime based on feedback */
760 window_size = 2; 762 window_size = 2;
761 register_hotcpu_notifier(&powerclamp_cpu_notifier); 763 retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
764 "thermal/intel_powerclamp:online",
765 powerclamp_cpu_online,
766 powerclamp_cpu_predown);
767 if (retval < 0)
768 goto exit_free;
769
770 hp_state = retval;
762 771
763 powerclamp_thread = alloc_percpu(struct task_struct *); 772 worker_data = alloc_percpu(struct powerclamp_worker_data);
764 if (!powerclamp_thread) { 773 if (!worker_data) {
765 retval = -ENOMEM; 774 retval = -ENOMEM;
766 goto exit_unregister; 775 goto exit_unregister;
767 } 776 }
@@ -781,9 +790,9 @@ static int __init powerclamp_init(void)
781 return 0; 790 return 0;
782 791
783exit_free_thread: 792exit_free_thread:
784 free_percpu(powerclamp_thread); 793 free_percpu(worker_data);
785exit_unregister: 794exit_unregister:
786 unregister_hotcpu_notifier(&powerclamp_cpu_notifier); 795 cpuhp_remove_state_nocalls(hp_state);
787exit_free: 796exit_free:
788 kfree(cpu_clamping_mask); 797 kfree(cpu_clamping_mask);
789 return retval; 798 return retval;
@@ -792,9 +801,9 @@ module_init(powerclamp_init);
792 801
793static void __exit powerclamp_exit(void) 802static void __exit powerclamp_exit(void)
794{ 803{
795 unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
796 end_power_clamp(); 804 end_power_clamp();
797 free_percpu(powerclamp_thread); 805 cpuhp_remove_state_nocalls(hp_state);
806 free_percpu(worker_data);
798 thermal_cooling_device_unregister(cooling_dev); 807 thermal_cooling_device_unregister(cooling_dev);
799 kfree(cpu_clamping_mask); 808 kfree(cpu_clamping_mask);
800 809
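The powerclamp rework above replaces the freezable per-CPU kthreads with kthread workers and moves the mwait loop into the new play_idle() helper. A minimal per-CPU worker sketch under those assumptions (example_* names and the 10 ms duration are illustrative; a single work item is shown for brevity):

#include <linux/cpu.h>
#include <linux/kthread.h>

static struct kthread_work inject_work;

static void example_inject(struct kthread_work *work)
{
        play_idle(10);          /* inject 10 ms of forced idle on this CPU */
}

static struct kthread_worker *example_start_worker(unsigned int cpu)
{
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "example_inject/%u", cpu);
        if (IS_ERR(worker))
                return NULL;

        kthread_init_work(&inject_work, example_inject);
        kthread_queue_work(worker, &inject_work);
        return worker;
}

static void example_stop_worker(struct kthread_worker *worker)
{
        kthread_cancel_work_sync(&inject_work);
        kthread_destroy_worker(worker);         /* flushes remaining work */
}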
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index f3db11c24654..c1ba00fc4888 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -249,6 +249,7 @@ extern int acpi_processor_register_performance(struct acpi_processor_performance
249 *performance, unsigned int cpu); 249 *performance, unsigned int cpu);
250extern void acpi_processor_unregister_performance(unsigned int cpu); 250extern void acpi_processor_unregister_performance(unsigned int cpu);
251 251
252int acpi_processor_pstate_control(void);
252/* note: this locks both the calling module and the processor module 253/* note: this locks both the calling module and the processor module
253 if a _PPC object exists, rmmod is disallowed then */ 254 if a _PPC object exists, rmmod is disallowed then */
254int acpi_processor_notify_smm(struct module *calling_module); 255int acpi_processor_notify_smm(struct module *calling_module);
@@ -294,7 +295,7 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx
294#ifdef CONFIG_CPU_FREQ 295#ifdef CONFIG_CPU_FREQ
295void acpi_processor_ppc_init(void); 296void acpi_processor_ppc_init(void);
296void acpi_processor_ppc_exit(void); 297void acpi_processor_ppc_exit(void);
297int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag); 298void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag);
298extern int acpi_processor_get_bios_limit(int cpu, unsigned int *limit); 299extern int acpi_processor_get_bios_limit(int cpu, unsigned int *limit);
299#else 300#else
300static inline void acpi_processor_ppc_init(void) 301static inline void acpi_processor_ppc_init(void)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e571128ad99a..09807c2ce328 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -238,6 +238,8 @@ void arch_cpu_idle_dead(void);
238int cpu_report_state(int cpu); 238int cpu_report_state(int cpu);
239int cpu_check_up_prepare(int cpu); 239int cpu_check_up_prepare(int cpu);
240void cpu_set_state_online(int cpu); 240void cpu_set_state_online(int cpu);
241void play_idle(unsigned long duration_ms);
242
241#ifdef CONFIG_HOTPLUG_CPU 243#ifdef CONFIG_HOTPLUG_CPU
242bool cpu_wait_death(unsigned int cpu, int seconds); 244bool cpu_wait_death(unsigned int cpu, int seconds);
243bool cpu_report_death(void); 245bool cpu_report_death(void);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 32dc0cbd51ca..7e05c5e4e45c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -175,7 +175,7 @@ void disable_cpufreq(void);
175 175
176u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); 176u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
177int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); 177int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
178int cpufreq_update_policy(unsigned int cpu); 178void cpufreq_update_policy(unsigned int cpu);
179bool have_governor_per_policy(void); 179bool have_governor_per_policy(void);
180struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); 180struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
181void cpufreq_enable_fast_switch(struct cpufreq_policy *policy); 181void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
@@ -234,6 +234,10 @@ __ATTR(_name, _perm, show_##_name, NULL)
234static struct freq_attr _name = \ 234static struct freq_attr _name = \
235__ATTR(_name, 0644, show_##_name, store_##_name) 235__ATTR(_name, 0644, show_##_name, store_##_name)
236 236
237#define cpufreq_freq_attr_wo(_name) \
238static struct freq_attr _name = \
239__ATTR(_name, 0200, NULL, store_##_name)
240
237struct global_attr { 241struct global_attr {
238 struct attribute attr; 242 struct attribute attr;
239 ssize_t (*show)(struct kobject *kobj, 243 ssize_t (*show)(struct kobject *kobj,
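The new cpufreq_freq_attr_wo() helper rounds out the existing _ro/_rw macros for sysfs attributes that only accept writes. A hypothetical user (store_reset() and its behaviour are placeholders, not part of this series):

static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
                           size_t count)
{
        /* parse buf and act on it; there is nothing meaningful to read back */
        return count;
}
cpufreq_freq_attr_wo(reset);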
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373c3478..da346f2817a8 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -74,6 +74,7 @@ struct cpuidle_driver_kobj;
74struct cpuidle_device { 74struct cpuidle_device {
75 unsigned int registered:1; 75 unsigned int registered:1;
76 unsigned int enabled:1; 76 unsigned int enabled:1;
77 unsigned int use_deepest_state:1;
77 unsigned int cpu; 78 unsigned int cpu;
78 79
79 int last_residency; 80 int last_residency;
@@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
192static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } 193static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; }
193#endif 194#endif
194 195
195#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) 196#ifdef CONFIG_CPU_IDLE
196extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, 197extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
197 struct cpuidle_device *dev); 198 struct cpuidle_device *dev);
198extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, 199extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
199 struct cpuidle_device *dev); 200 struct cpuidle_device *dev);
201extern void cpuidle_use_deepest_state(bool enable);
200#else 202#else
201static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, 203static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
202 struct cpuidle_device *dev) 204 struct cpuidle_device *dev)
@@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
204static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, 206static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
205 struct cpuidle_device *dev) 207 struct cpuidle_device *dev)
206{return -ENODEV; } 208{return -ENODEV; }
209static inline void cpuidle_use_deepest_state(bool enable)
210{
211}
207#endif 212#endif
208 213
209/* kernel/sched/idle.c */ 214/* kernel/sched/idle.c */
@@ -235,8 +240,6 @@ struct cpuidle_governor {
235 int (*select) (struct cpuidle_driver *drv, 240 int (*select) (struct cpuidle_driver *drv,
236 struct cpuidle_device *dev); 241 struct cpuidle_device *dev);
237 void (*reflect) (struct cpuidle_device *dev, int index); 242 void (*reflect) (struct cpuidle_device *dev, int index);
238
239 struct module *owner;
240}; 243};
241 244
242#ifdef CONFIG_CPU_IDLE 245#ifdef CONFIG_CPU_IDLE
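cpuidle_use_deepest_state() is exposed so that forced idle injection can temporarily bypass the governor; the expected caller is the play_idle() helper declared in the cpu.h hunk above. A hedged sketch of the intended bracketing (not a verbatim copy of play_idle()):

        cpuidle_use_deepest_state(true);        /* governor bypassed from here */
        /* ... idle loop runs for the requested duration ... */
        cpuidle_use_deepest_state(false);       /* restore normal state selection */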
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index a09fe5c009c8..81ece61075df 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -15,11 +15,11 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/of.h> 16#include <linux/of.h>
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/spinlock.h>
18 19
19/* Defines used for the flags field in the struct generic_pm_domain */ 20/* Defines used for the flags field in the struct generic_pm_domain */
20#define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ 21#define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */
21 22#define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */
22#define GENPD_MAX_NUM_STATES 8 /* Number of possible low power states */
23 23
24enum gpd_status { 24enum gpd_status {
25 GPD_STATE_ACTIVE = 0, /* PM domain is active */ 25 GPD_STATE_ACTIVE = 0, /* PM domain is active */
@@ -40,15 +40,18 @@ struct gpd_dev_ops {
40struct genpd_power_state { 40struct genpd_power_state {
41 s64 power_off_latency_ns; 41 s64 power_off_latency_ns;
42 s64 power_on_latency_ns; 42 s64 power_on_latency_ns;
43 s64 residency_ns;
44 struct fwnode_handle *fwnode;
43}; 45};
44 46
47struct genpd_lock_ops;
48
45struct generic_pm_domain { 49struct generic_pm_domain {
46 struct dev_pm_domain domain; /* PM domain operations */ 50 struct dev_pm_domain domain; /* PM domain operations */
47 struct list_head gpd_list_node; /* Node in the global PM domains list */ 51 struct list_head gpd_list_node; /* Node in the global PM domains list */
48 struct list_head master_links; /* Links with PM domain as a master */ 52 struct list_head master_links; /* Links with PM domain as a master */
49 struct list_head slave_links; /* Links with PM domain as a slave */ 53 struct list_head slave_links; /* Links with PM domain as a slave */
50 struct list_head dev_list; /* List of devices */ 54 struct list_head dev_list; /* List of devices */
51 struct mutex lock;
52 struct dev_power_governor *gov; 55 struct dev_power_governor *gov;
53 struct work_struct power_off_work; 56 struct work_struct power_off_work;
54 struct fwnode_handle *provider; /* Identity of the domain provider */ 57 struct fwnode_handle *provider; /* Identity of the domain provider */
@@ -70,9 +73,18 @@ struct generic_pm_domain {
70 void (*detach_dev)(struct generic_pm_domain *domain, 73 void (*detach_dev)(struct generic_pm_domain *domain,
71 struct device *dev); 74 struct device *dev);
72 unsigned int flags; /* Bit field of configs for genpd */ 75 unsigned int flags; /* Bit field of configs for genpd */
73 struct genpd_power_state states[GENPD_MAX_NUM_STATES]; 76 struct genpd_power_state *states;
74 unsigned int state_count; /* number of states */ 77 unsigned int state_count; /* number of states */
75 unsigned int state_idx; /* state that genpd will go to when off */ 78 unsigned int state_idx; /* state that genpd will go to when off */
79 void *free; /* Free the state that was allocated for default */
80 const struct genpd_lock_ops *lock_ops;
81 union {
82 struct mutex mlock;
83 struct {
84 spinlock_t slock;
85 unsigned long lock_flags;
86 };
87 };
76 88
77}; 89};
78 90
@@ -205,6 +217,8 @@ extern int of_genpd_add_device(struct of_phandle_args *args,
205extern int of_genpd_add_subdomain(struct of_phandle_args *parent, 217extern int of_genpd_add_subdomain(struct of_phandle_args *parent,
206 struct of_phandle_args *new_subdomain); 218 struct of_phandle_args *new_subdomain);
207extern struct generic_pm_domain *of_genpd_remove_last(struct device_node *np); 219extern struct generic_pm_domain *of_genpd_remove_last(struct device_node *np);
220extern int of_genpd_parse_idle_states(struct device_node *dn,
221 struct genpd_power_state **states, int *n);
208 222
209int genpd_dev_pm_attach(struct device *dev); 223int genpd_dev_pm_attach(struct device *dev);
210#else /* !CONFIG_PM_GENERIC_DOMAINS_OF */ 224#else /* !CONFIG_PM_GENERIC_DOMAINS_OF */
@@ -234,6 +248,12 @@ static inline int of_genpd_add_subdomain(struct of_phandle_args *parent,
234 return -ENODEV; 248 return -ENODEV;
235} 249}
236 250
251static inline int of_genpd_parse_idle_states(struct device_node *dn,
252 struct genpd_power_state **states, int *n)
253{
254 return -ENODEV;
255}
256
237static inline int genpd_dev_pm_attach(struct device *dev) 257static inline int genpd_dev_pm_attach(struct device *dev)
238{ 258{
239 return -ENODEV; 259 return -ENODEV;
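A minimal, hedged provider-side sketch of the new genpd fields; the domain name and device-tree handling are assumptions, not taken from this patch.

static struct generic_pm_domain example_pd = {
	.name  = "example-pd",
	.flags = GENPD_FLAG_IRQ_SAFE,	/* genpd will pick the spinlock-based lock_ops */
};

static int example_pd_setup(struct device_node *np)
{
	struct genpd_power_state *states;
	int n, ret;

	ret = of_genpd_parse_idle_states(np, &states, &n);
	if (ret)
		return ret;

	/* states is now a pointer, so providers are no longer capped at
	 * GENPD_MAX_NUM_STATES entries */
	example_pd.states = states;
	example_pd.state_count = n;

	return pm_genpd_init(&example_pd, NULL, false);
}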
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index bca26157f5b6..0edd88f93904 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -17,13 +17,65 @@
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19 19
20struct clk;
21struct regulator;
20struct dev_pm_opp; 22struct dev_pm_opp;
21struct device; 23struct device;
24struct opp_table;
22 25
23enum dev_pm_opp_event { 26enum dev_pm_opp_event {
24 OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE, 27 OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE,
25}; 28};
26 29
30/**
31 * struct dev_pm_opp_supply - Power supply voltage/current values
32 * @u_volt: Target voltage in microvolts corresponding to this OPP
33 * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP
34 * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP
35 * @u_amp: Maximum current drawn by the device in microamperes
36 *
37 * This structure stores the voltage/current values for a single power supply.
38 */
39struct dev_pm_opp_supply {
40 unsigned long u_volt;
41 unsigned long u_volt_min;
42 unsigned long u_volt_max;
43 unsigned long u_amp;
44};
45
46/**
47 * struct dev_pm_opp_info - OPP freq/voltage/current values
48 * @rate: Target clk rate in hz
49 * @supplies: Array of voltage/current values for all power supplies
50 *
51 * This structure stores the freq/voltage/current values for a single OPP.
52 */
53struct dev_pm_opp_info {
54 unsigned long rate;
55 struct dev_pm_opp_supply *supplies;
56};
57
58/**
59 * struct dev_pm_set_opp_data - Set OPP data
60 * @old_opp: Old OPP info
61 * @new_opp: New OPP info
62 * @regulators: Array of regulator pointers
63 * @regulator_count: Number of regulators
64 * @clk: Pointer to clk
65 * @dev: Pointer to the struct device
66 *
67 * This structure contains all information required for setting an OPP.
68 */
69struct dev_pm_set_opp_data {
70 struct dev_pm_opp_info old_opp;
71 struct dev_pm_opp_info new_opp;
72
73 struct regulator **regulators;
74 unsigned int regulator_count;
75 struct clk *clk;
76 struct device *dev;
77};
78
27#if defined(CONFIG_PM_OPP) 79#if defined(CONFIG_PM_OPP)
28 80
29unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp); 81unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
@@ -62,8 +114,10 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
62void dev_pm_opp_put_supported_hw(struct device *dev); 114void dev_pm_opp_put_supported_hw(struct device *dev);
63int dev_pm_opp_set_prop_name(struct device *dev, const char *name); 115int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
64void dev_pm_opp_put_prop_name(struct device *dev); 116void dev_pm_opp_put_prop_name(struct device *dev);
65int dev_pm_opp_set_regulator(struct device *dev, const char *name); 117struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
66void dev_pm_opp_put_regulator(struct device *dev); 118void dev_pm_opp_put_regulators(struct opp_table *opp_table);
119int dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
120void dev_pm_opp_register_put_opp_helper(struct device *dev);
67int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); 121int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
68int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask); 122int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
69int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); 123int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -163,6 +217,14 @@ static inline int dev_pm_opp_set_supported_hw(struct device *dev,
163 217
164static inline void dev_pm_opp_put_supported_hw(struct device *dev) {} 218static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
165 219
220static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
221 int (*set_opp)(struct dev_pm_set_opp_data *data))
222{
223 return -ENOTSUPP;
224}
225
226static inline void dev_pm_opp_register_put_opp_helper(struct device *dev) {}
227
166static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name) 228static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
167{ 229{
168 return -ENOTSUPP; 230 return -ENOTSUPP;
@@ -170,12 +232,12 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
170 232
171static inline void dev_pm_opp_put_prop_name(struct device *dev) {} 233static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
172 234
173static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name) 235static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
174{ 236{
175 return -ENOTSUPP; 237 return ERR_PTR(-ENOTSUPP);
176} 238}
177 239
178static inline void dev_pm_opp_put_regulator(struct device *dev) {} 240static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
179 241
180static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) 242static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
181{ 243{
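Not part of this patch: a hedged sketch of a platform driver adopting the multi-regulator OPP interface declared above; the supply names and the helper body are assumptions.

static const char * const example_supplies[] = { "vdd", "vddmem" };
static struct opp_table *example_opp_table;

static int example_set_opp(struct dev_pm_set_opp_data *data)
{
	/* program data->regulators[] and data->clk according to data->new_opp,
	 * honouring the per-supply values in new_opp.supplies[] */
	return 0;
}

static int example_opp_setup(struct device *dev)
{
	int ret;

	example_opp_table = dev_pm_opp_set_regulators(dev, example_supplies,
						      ARRAY_SIZE(example_supplies));
	if (IS_ERR(example_opp_table))
		return PTR_ERR(example_opp_table);

	ret = dev_pm_opp_register_set_opp_helper(dev, example_set_opp);
	if (ret)
		dev_pm_opp_put_regulators(example_opp_table);

	return ret;
}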
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 2e14d2667b6c..4957fc185ea9 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -61,12 +61,6 @@ static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
61 dev->power.ignore_children = enable; 61 dev->power.ignore_children = enable;
62} 62}
63 63
64static inline bool pm_children_suspended(struct device *dev)
65{
66 return dev->power.ignore_children
67 || !atomic_read(&dev->power.child_count);
68}
69
70static inline void pm_runtime_get_noresume(struct device *dev) 64static inline void pm_runtime_get_noresume(struct device *dev)
71{ 65{
72 atomic_inc(&dev->power.usage_count); 66 atomic_inc(&dev->power.usage_count);
@@ -162,7 +156,6 @@ static inline void pm_runtime_allow(struct device *dev) {}
162static inline void pm_runtime_forbid(struct device *dev) {} 156static inline void pm_runtime_forbid(struct device *dev) {}
163 157
164static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} 158static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {}
165static inline bool pm_children_suspended(struct device *dev) { return false; }
166static inline void pm_runtime_get_noresume(struct device *dev) {} 159static inline void pm_runtime_get_noresume(struct device *dev) {}
167static inline void pm_runtime_put_noidle(struct device *dev) {} 160static inline void pm_runtime_put_noidle(struct device *dev) {}
168static inline bool device_run_wake(struct device *dev) { return false; } 161static inline bool device_run_wake(struct device *dev) { return false; }
@@ -265,9 +258,9 @@ static inline int pm_runtime_set_active(struct device *dev)
265 return __pm_runtime_set_status(dev, RPM_ACTIVE); 258 return __pm_runtime_set_status(dev, RPM_ACTIVE);
266} 259}
267 260
268static inline void pm_runtime_set_suspended(struct device *dev) 261static inline int pm_runtime_set_suspended(struct device *dev)
269{ 262{
270 __pm_runtime_set_status(dev, RPM_SUSPENDED); 263 return __pm_runtime_set_status(dev, RPM_SUSPENDED);
271} 264}
272 265
273static inline void pm_runtime_disable(struct device *dev) 266static inline void pm_runtime_disable(struct device *dev)
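Because pm_runtime_set_suspended() now forwards the return value of __pm_runtime_set_status(), callers may want to check it; a hedged sketch:

	ret = pm_runtime_set_suspended(dev);
	if (ret)
		dev_warn(dev, "failed to set runtime PM status to suspended: %d\n", ret);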
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0e90f2973719..5ccbbfe41345 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2287,6 +2287,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
2287/* 2287/*
2288 * Per process flags 2288 * Per process flags
2289 */ 2289 */
2290#define PF_IDLE 0x00000002 /* I am an IDLE thread */
2290#define PF_EXITING 0x00000004 /* getting shut down */ 2291#define PF_EXITING 0x00000004 /* getting shut down */
2291#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 2292#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
2292#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 2293#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
@@ -2648,7 +2649,7 @@ extern struct task_struct *idle_task(int cpu);
2648 */ 2649 */
2649static inline bool is_idle_task(const struct task_struct *p) 2650static inline bool is_idle_task(const struct task_struct *p)
2650{ 2651{
2651 return p->pid == 0; 2652 return !!(p->flags & PF_IDLE);
2652} 2653}
2653extern struct task_struct *curr_task(int cpu); 2654extern struct task_struct *curr_task(int cpu);
2654extern void ia64_set_curr_task(int cpu, struct task_struct *p); 2655extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d9718378a8be..0c729c3c8549 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -194,6 +194,8 @@ struct platform_freeze_ops {
194}; 194};
195 195
196#ifdef CONFIG_SUSPEND 196#ifdef CONFIG_SUSPEND
197extern suspend_state_t mem_sleep_default;
198
197/** 199/**
198 * suspend_set_ops - set platform dependent suspend operations 200 * suspend_set_ops - set platform dependent suspend operations
199 * @ops: The new suspend operations to set. 201 * @ops: The new suspend operations to set.
diff --git a/kernel/fork.c b/kernel/fork.c
index 7377f414f3ce..a439ac429669 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1544,7 +1544,7 @@ static __latent_entropy struct task_struct *copy_process(
1544 goto bad_fork_cleanup_count; 1544 goto bad_fork_cleanup_count;
1545 1545
1546 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1546 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1547 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1547 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
1548 p->flags |= PF_FORKNOEXEC; 1548 p->flags |= PF_FORKNOEXEC;
1549 INIT_LIST_HEAD(&p->children); 1549 INIT_LIST_HEAD(&p->children);
1550 INIT_LIST_HEAD(&p->sibling); 1550 INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 281a697fd458..d401c21136d1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
78 78
79power_attr(pm_async); 79power_attr(pm_async);
80 80
81#ifdef CONFIG_SUSPEND
82static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
83 char *buf)
84{
85 char *s = buf;
86 suspend_state_t i;
87
88 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
89 if (mem_sleep_states[i]) {
90 const char *label = mem_sleep_states[i];
91
92 if (mem_sleep_current == i)
93 s += sprintf(s, "[%s] ", label);
94 else
95 s += sprintf(s, "%s ", label);
96 }
97
98 /* Convert the last space to a newline if needed. */
99 if (s != buf)
100 *(s-1) = '\n';
101
102 return (s - buf);
103}
104
105static suspend_state_t decode_suspend_state(const char *buf, size_t n)
106{
107 suspend_state_t state;
108 char *p;
109 int len;
110
111 p = memchr(buf, '\n', n);
112 len = p ? p - buf : n;
113
114 for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
115 const char *label = mem_sleep_states[state];
116
117 if (label && len == strlen(label) && !strncmp(buf, label, len))
118 return state;
119 }
120
121 return PM_SUSPEND_ON;
122}
123
124static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr,
125 const char *buf, size_t n)
126{
127 suspend_state_t state;
128 int error;
129
130 error = pm_autosleep_lock();
131 if (error)
132 return error;
133
134 if (pm_autosleep_state() > PM_SUSPEND_ON) {
135 error = -EBUSY;
136 goto out;
137 }
138
139 state = decode_suspend_state(buf, n);
140 if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON)
141 mem_sleep_current = state;
142 else
143 error = -EINVAL;
144
145 out:
146 pm_autosleep_unlock();
147 return error ? error : n;
148}
149
150power_attr(mem_sleep);
151#endif /* CONFIG_SUSPEND */
152
81#ifdef CONFIG_PM_DEBUG 153#ifdef CONFIG_PM_DEBUG
82int pm_test_level = TEST_NONE; 154int pm_test_level = TEST_NONE;
83 155
@@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
368 } 440 }
369 441
370 state = decode_state(buf, n); 442 state = decode_state(buf, n);
371 if (state < PM_SUSPEND_MAX) 443 if (state < PM_SUSPEND_MAX) {
444 if (state == PM_SUSPEND_MEM)
445 state = mem_sleep_current;
446
372 error = pm_suspend(state); 447 error = pm_suspend(state);
373 else if (state == PM_SUSPEND_MAX) 448 } else if (state == PM_SUSPEND_MAX) {
374 error = hibernate(); 449 error = hibernate();
375 else 450 } else {
376 error = -EINVAL; 451 error = -EINVAL;
452 }
377 453
378 out: 454 out:
379 pm_autosleep_unlock(); 455 pm_autosleep_unlock();
@@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj,
485 && strcmp(buf, "off") && strcmp(buf, "off\n")) 561 && strcmp(buf, "off") && strcmp(buf, "off\n"))
486 return -EINVAL; 562 return -EINVAL;
487 563
564 if (state == PM_SUSPEND_MEM)
565 state = mem_sleep_current;
566
488 error = pm_autosleep_set_state(state); 567 error = pm_autosleep_set_state(state);
489 return error ? error : n; 568 return error ? error : n;
490} 569}
@@ -602,6 +681,9 @@ static struct attribute * g[] = {
602#ifdef CONFIG_PM_SLEEP 681#ifdef CONFIG_PM_SLEEP
603 &pm_async_attr.attr, 682 &pm_async_attr.attr,
604 &wakeup_count_attr.attr, 683 &wakeup_count_attr.attr,
684#ifdef CONFIG_SUSPEND
685 &mem_sleep_attr.attr,
686#endif
605#ifdef CONFIG_PM_AUTOSLEEP 687#ifdef CONFIG_PM_AUTOSLEEP
606 &autosleep_attr.attr, 688 &autosleep_attr.attr,
607#endif 689#endif
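For illustration only (not introduced by this patch), a tiny userspace C sketch that picks the "deep" variant through the new /sys/power/mem_sleep file before a later write of "mem" to /sys/power/state:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/power/mem_sleep", "w");

	if (!f)
		return 1;
	/* "s2idle" and "shallow" are also accepted when the platform supports them */
	fputs("deep\n", f);
	return fclose(f) ? 1 : 0;
}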
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 56d1d0dedf76..1dfa0da827d3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
189 189
190#ifdef CONFIG_SUSPEND 190#ifdef CONFIG_SUSPEND
191/* kernel/power/suspend.c */ 191/* kernel/power/suspend.c */
192extern const char *pm_labels[]; 192extern const char * const pm_labels[];
193extern const char *pm_states[]; 193extern const char *pm_states[];
194extern const char *mem_sleep_states[];
195extern suspend_state_t mem_sleep_current;
194 196
195extern int suspend_devices_and_enter(suspend_state_t state); 197extern int suspend_devices_and_enter(suspend_state_t state);
196#else /* !CONFIG_SUSPEND */ 198#else /* !CONFIG_SUSPEND */
199#define mem_sleep_current PM_SUSPEND_ON
200
197static inline int suspend_devices_and_enter(suspend_state_t state) 201static inline int suspend_devices_and_enter(suspend_state_t state)
198{ 202{
199 return -ENOSYS; 203 return -ENOSYS;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6ccb08f57fcb..f67ceb7768b8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -32,8 +32,21 @@
32 32
33#include "power.h" 33#include "power.h"
34 34
35const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; 35const char * const pm_labels[] = {
36 [PM_SUSPEND_FREEZE] = "freeze",
37 [PM_SUSPEND_STANDBY] = "standby",
38 [PM_SUSPEND_MEM] = "mem",
39};
36const char *pm_states[PM_SUSPEND_MAX]; 40const char *pm_states[PM_SUSPEND_MAX];
41static const char * const mem_sleep_labels[] = {
42 [PM_SUSPEND_FREEZE] = "s2idle",
43 [PM_SUSPEND_STANDBY] = "shallow",
44 [PM_SUSPEND_MEM] = "deep",
45};
46const char *mem_sleep_states[PM_SUSPEND_MAX];
47
48suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
49suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
37 50
38unsigned int pm_suspend_global_flags; 51unsigned int pm_suspend_global_flags;
39EXPORT_SYMBOL_GPL(pm_suspend_global_flags); 52EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state)
110 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 123 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
111} 124}
112 125
113/*
114 * If this is set, the "mem" label always corresponds to the deepest sleep state
115 * available, the "standby" label corresponds to the second deepest sleep state
116 * available (if any), and the "freeze" label corresponds to the remaining
117 * available sleep state (if there is one).
118 */
119static bool relative_states;
120
121void __init pm_states_init(void) 126void __init pm_states_init(void)
122{ 127{
128 /* "mem" and "freeze" are always present in /sys/power/state. */
129 pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
130 pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
123 /* 131 /*
124 * freeze state should be supported even without any suspend_ops, 132 * Suspend-to-idle should be supported even without any suspend_ops,
125 * initialize pm_states accordingly here 133 * initialize mem_sleep_states[] accordingly here.
126 */ 134 */
127 pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; 135 mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
128} 136}
129 137
130static int __init sleep_states_setup(char *str) 138static int __init mem_sleep_default_setup(char *str)
131{ 139{
132 relative_states = !strncmp(str, "1", 1); 140 suspend_state_t state;
141
142 for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
143 if (mem_sleep_labels[state] &&
144 !strcmp(str, mem_sleep_labels[state])) {
145 mem_sleep_default = state;
146 break;
147 }
148
133 return 1; 149 return 1;
134} 150}
135 151__setup("mem_sleep_default=", mem_sleep_default_setup);
136__setup("relative_sleep_states=", sleep_states_setup);
137 152
138/** 153/**
139 * suspend_set_ops - Set the global suspend method table. 154 * suspend_set_ops - Set the global suspend method table.
@@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup);
141 */ 156 */
142void suspend_set_ops(const struct platform_suspend_ops *ops) 157void suspend_set_ops(const struct platform_suspend_ops *ops)
143{ 158{
144 suspend_state_t i;
145 int j = 0;
146
147 lock_system_sleep(); 159 lock_system_sleep();
148 160
149 suspend_ops = ops; 161 suspend_ops = ops;
150 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
151 if (valid_state(i)) {
152 pm_states[i] = pm_labels[j++];
153 } else if (!relative_states) {
154 pm_states[i] = NULL;
155 j++;
156 }
157 162
158 pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; 163 if (valid_state(PM_SUSPEND_STANDBY)) {
164 mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY];
165 pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY];
166 if (mem_sleep_default == PM_SUSPEND_STANDBY)
167 mem_sleep_current = PM_SUSPEND_STANDBY;
168 }
169 if (valid_state(PM_SUSPEND_MEM)) {
170 mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
171 if (mem_sleep_default >= PM_SUSPEND_MEM)
172 mem_sleep_current = PM_SUSPEND_MEM;
173 }
159 174
160 unlock_system_sleep(); 175 unlock_system_sleep();
161} 176}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d18804491d9f..966556ebdbb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5280,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu)
5280 __sched_fork(0, idle); 5280 __sched_fork(0, idle);
5281 idle->state = TASK_RUNNING; 5281 idle->state = TASK_RUNNING;
5282 idle->se.exec_start = sched_clock(); 5282 idle->se.exec_start = sched_clock();
5283 idle->flags |= PF_IDLE;
5283 5284
5284 kasan_unpoison_task_stack(idle); 5285 kasan_unpoison_task_stack(idle);
5285 5286
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e06898997d..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,11 +12,14 @@
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/kthread.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <trace/events/power.h> 17#include <trace/events/power.h>
17 18
18#include "sched.h" 19#include "sched.h"
19 20
21#define SUGOV_KTHREAD_PRIORITY 50
22
20struct sugov_tunables { 23struct sugov_tunables {
21 struct gov_attr_set attr_set; 24 struct gov_attr_set attr_set;
22 unsigned int rate_limit_us; 25 unsigned int rate_limit_us;
@@ -35,8 +38,10 @@ struct sugov_policy {
35 38
36 /* The next fields are only needed if fast switch cannot be used. */ 39 /* The next fields are only needed if fast switch cannot be used. */
37 struct irq_work irq_work; 40 struct irq_work irq_work;
38 struct work_struct work; 41 struct kthread_work work;
39 struct mutex work_lock; 42 struct mutex work_lock;
43 struct kthread_worker worker;
44 struct task_struct *thread;
40 bool work_in_progress; 45 bool work_in_progress;
41 46
42 bool need_freq_update; 47 bool need_freq_update;
@@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
291 raw_spin_unlock(&sg_policy->update_lock); 296 raw_spin_unlock(&sg_policy->update_lock);
292} 297}
293 298
294static void sugov_work(struct work_struct *work) 299static void sugov_work(struct kthread_work *work)
295{ 300{
296 struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); 301 struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
297 302
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
308 struct sugov_policy *sg_policy; 313 struct sugov_policy *sg_policy;
309 314
310 sg_policy = container_of(irq_work, struct sugov_policy, irq_work); 315 sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
311 schedule_work_on(smp_processor_id(), &sg_policy->work); 316
317 /*
318 * For RT and deadline tasks, the schedutil governor shoots the
319 * frequency to maximum. Special care must be taken to ensure that this
320 * kthread doesn't result in the same behavior.
321 *
322 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
323 * updated only at the end of the sugov_work() function and before that
324 * the schedutil governor rejects all other frequency scaling requests.
325 *
326 * There is a very rare case though, where the RT thread yields right
327 * after the work_in_progress flag is cleared. The effects of that are
328 * neglected for now.
329 */
330 kthread_queue_work(&sg_policy->worker, &sg_policy->work);
312} 331}
313 332
314/************************** sysfs interface ************************/ 333/************************** sysfs interface ************************/
@@ -371,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
371 return NULL; 390 return NULL;
372 391
373 sg_policy->policy = policy; 392 sg_policy->policy = policy;
374 init_irq_work(&sg_policy->irq_work, sugov_irq_work);
375 INIT_WORK(&sg_policy->work, sugov_work);
376 mutex_init(&sg_policy->work_lock);
377 raw_spin_lock_init(&sg_policy->update_lock); 393 raw_spin_lock_init(&sg_policy->update_lock);
378 return sg_policy; 394 return sg_policy;
379} 395}
380 396
381static void sugov_policy_free(struct sugov_policy *sg_policy) 397static void sugov_policy_free(struct sugov_policy *sg_policy)
382{ 398{
383 mutex_destroy(&sg_policy->work_lock);
384 kfree(sg_policy); 399 kfree(sg_policy);
385} 400}
386 401
402static int sugov_kthread_create(struct sugov_policy *sg_policy)
403{
404 struct task_struct *thread;
405 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
406 struct cpufreq_policy *policy = sg_policy->policy;
407 int ret;
408
409 /* kthread only required for slow path */
410 if (policy->fast_switch_enabled)
411 return 0;
412
413 kthread_init_work(&sg_policy->work, sugov_work);
414 kthread_init_worker(&sg_policy->worker);
415 thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
416 "sugov:%d",
417 cpumask_first(policy->related_cpus));
418 if (IS_ERR(thread)) {
419 pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
420 return PTR_ERR(thread);
421 }
422
423 ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
424 if (ret) {
425 kthread_stop(thread);
426 pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
427 return ret;
428 }
429
430 sg_policy->thread = thread;
431 kthread_bind_mask(thread, policy->related_cpus);
432 init_irq_work(&sg_policy->irq_work, sugov_irq_work);
433 mutex_init(&sg_policy->work_lock);
434
435 wake_up_process(thread);
436
437 return 0;
438}
439
440static void sugov_kthread_stop(struct sugov_policy *sg_policy)
441{
442 /* kthread only required for slow path */
443 if (sg_policy->policy->fast_switch_enabled)
444 return;
445
446 kthread_flush_worker(&sg_policy->worker);
447 kthread_stop(sg_policy->thread);
448 mutex_destroy(&sg_policy->work_lock);
449}
450
387static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) 451static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
388{ 452{
389 struct sugov_tunables *tunables; 453 struct sugov_tunables *tunables;
@@ -416,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy)
416 if (policy->governor_data) 480 if (policy->governor_data)
417 return -EBUSY; 481 return -EBUSY;
418 482
483 cpufreq_enable_fast_switch(policy);
484
419 sg_policy = sugov_policy_alloc(policy); 485 sg_policy = sugov_policy_alloc(policy);
420 if (!sg_policy) 486 if (!sg_policy) {
421 return -ENOMEM; 487 ret = -ENOMEM;
488 goto disable_fast_switch;
489 }
490
491 ret = sugov_kthread_create(sg_policy);
492 if (ret)
493 goto free_sg_policy;
422 494
423 mutex_lock(&global_tunables_lock); 495 mutex_lock(&global_tunables_lock);
424 496
425 if (global_tunables) { 497 if (global_tunables) {
426 if (WARN_ON(have_governor_per_policy())) { 498 if (WARN_ON(have_governor_per_policy())) {
427 ret = -EINVAL; 499 ret = -EINVAL;
428 goto free_sg_policy; 500 goto stop_kthread;
429 } 501 }
430 policy->governor_data = sg_policy; 502 policy->governor_data = sg_policy;
431 sg_policy->tunables = global_tunables; 503 sg_policy->tunables = global_tunables;
@@ -437,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy)
437 tunables = sugov_tunables_alloc(sg_policy); 509 tunables = sugov_tunables_alloc(sg_policy);
438 if (!tunables) { 510 if (!tunables) {
439 ret = -ENOMEM; 511 ret = -ENOMEM;
440 goto free_sg_policy; 512 goto stop_kthread;
441 } 513 }
442 514
443 tunables->rate_limit_us = LATENCY_MULTIPLIER; 515 tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -454,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy)
454 if (ret) 526 if (ret)
455 goto fail; 527 goto fail;
456 528
457 out: 529out:
458 mutex_unlock(&global_tunables_lock); 530 mutex_unlock(&global_tunables_lock);
459
460 cpufreq_enable_fast_switch(policy);
461 return 0; 531 return 0;
462 532
463 fail: 533fail:
464 policy->governor_data = NULL; 534 policy->governor_data = NULL;
465 sugov_tunables_free(tunables); 535 sugov_tunables_free(tunables);
466 536
467 free_sg_policy: 537stop_kthread:
538 sugov_kthread_stop(sg_policy);
539
540free_sg_policy:
468 mutex_unlock(&global_tunables_lock); 541 mutex_unlock(&global_tunables_lock);
469 542
470 sugov_policy_free(sg_policy); 543 sugov_policy_free(sg_policy);
544
545disable_fast_switch:
546 cpufreq_disable_fast_switch(policy);
547
471 pr_err("initialization failed (error %d)\n", ret); 548 pr_err("initialization failed (error %d)\n", ret);
472 return ret; 549 return ret;
473} 550}
@@ -478,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
478 struct sugov_tunables *tunables = sg_policy->tunables; 555 struct sugov_tunables *tunables = sg_policy->tunables;
479 unsigned int count; 556 unsigned int count;
480 557
481 cpufreq_disable_fast_switch(policy);
482
483 mutex_lock(&global_tunables_lock); 558 mutex_lock(&global_tunables_lock);
484 559
485 count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); 560 count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -489,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy)
489 564
490 mutex_unlock(&global_tunables_lock); 565 mutex_unlock(&global_tunables_lock);
491 566
567 sugov_kthread_stop(sg_policy);
492 sugov_policy_free(sg_policy); 568 sugov_policy_free(sg_policy);
569 cpufreq_disable_fast_switch(policy);
493} 570}
494 571
495static int sugov_start(struct cpufreq_policy *policy) 572static int sugov_start(struct cpufreq_policy *policy)
@@ -535,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
535 612
536 synchronize_sched(); 613 synchronize_sched();
537 614
538 irq_work_sync(&sg_policy->irq_work); 615 if (!policy->fast_switch_enabled) {
539 cancel_work_sync(&sg_policy->work); 616 irq_work_sync(&sg_policy->irq_work);
617 kthread_cancel_work_sync(&sg_policy->work);
618 }
540} 619}
541 620
542static void sugov_limits(struct cpufreq_policy *policy) 621static void sugov_limits(struct cpufreq_policy *policy)
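A hedged sketch of the fast/slow split this hunk assumes: the scheduler-context update hook either switches the frequency directly or raises the irq_work, which then queues the kthread work shown above. Everything here apart from the kernel APIs and the sugov_policy fields from this patch is illustrative.

static void example_request_freq_change(struct sugov_policy *sg_policy)
{
	if (sg_policy->policy->fast_switch_enabled) {
		/* fast path: the frequency is written directly from scheduler context */
		return;
	}

	/* slow path: hand off to the SCHED_FIFO worker via the irq_work */
	irq_work_queue(&sg_policy->irq_work);
}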
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,11 +164,14 @@ static void cpuidle_idle_call(void)
164 * timekeeping to prevent timer interrupts from kicking us out of idle 164 * timekeeping to prevent timer interrupts from kicking us out of idle
165 * until a proper wakeup interrupt happens. 165 * until a proper wakeup interrupt happens.
166 */ 166 */
167 if (idle_should_freeze()) { 167
168 entered_state = cpuidle_enter_freeze(drv, dev); 168 if (idle_should_freeze() || dev->use_deepest_state) {
169 if (entered_state > 0) { 169 if (idle_should_freeze()) {
170 local_irq_enable(); 170 entered_state = cpuidle_enter_freeze(drv, dev);
171 goto exit_idle; 171 if (entered_state > 0) {
172 local_irq_enable();
173 goto exit_idle;
174 }
172 } 175 }
173 176
174 next_state = cpuidle_find_deepest_state(drv, dev); 177 next_state = cpuidle_find_deepest_state(drv, dev);
@@ -202,76 +205,65 @@ exit_idle:
202 * 205 *
203 * Called with polling cleared. 206 * Called with polling cleared.
204 */ 207 */
205static void cpu_idle_loop(void) 208static void do_idle(void)
206{ 209{
207 int cpu = smp_processor_id(); 210 /*
208 211 * If the arch has a polling bit, we maintain an invariant:
209 while (1) { 212 *
210 /* 213 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
211 * If the arch has a polling bit, we maintain an invariant: 214 * rq->idle). This means that, if rq->idle has the polling bit set,
212 * 215 * then setting need_resched is guaranteed to cause the CPU to
213 * Our polling bit is clear if we're not scheduled (i.e. if 216 * reschedule.
214 * rq->curr != rq->idle). This means that, if rq->idle has 217 */
215 * the polling bit set, then setting need_resched is
216 * guaranteed to cause the cpu to reschedule.
217 */
218
219 __current_set_polling();
220 quiet_vmstat();
221 tick_nohz_idle_enter();
222 218
223 while (!need_resched()) { 219 __current_set_polling();
224 check_pgt_cache(); 220 tick_nohz_idle_enter();
225 rmb();
226 221
227 if (cpu_is_offline(cpu)) { 222 while (!need_resched()) {
228 cpuhp_report_idle_dead(); 223 check_pgt_cache();
229 arch_cpu_idle_dead(); 224 rmb();
230 }
231 225
232 local_irq_disable(); 226 if (cpu_is_offline(smp_processor_id())) {
233 arch_cpu_idle_enter(); 227 cpuhp_report_idle_dead();
234 228 arch_cpu_idle_dead();
235 /*
236 * In poll mode we reenable interrupts and spin.
237 *
238 * Also if we detected in the wakeup from idle
239 * path that the tick broadcast device expired
240 * for us, we don't want to go deep idle as we
241 * know that the IPI is going to arrive right
242 * away
243 */
244 if (cpu_idle_force_poll || tick_check_broadcast_expired())
245 cpu_idle_poll();
246 else
247 cpuidle_idle_call();
248
249 arch_cpu_idle_exit();
250 } 229 }
251 230
252 /* 231 local_irq_disable();
253 * Since we fell out of the loop above, we know 232 arch_cpu_idle_enter();
254 * TIF_NEED_RESCHED must be set, propagate it into
255 * PREEMPT_NEED_RESCHED.
256 *
257 * This is required because for polling idle loops we will
258 * not have had an IPI to fold the state for us.
259 */
260 preempt_set_need_resched();
261 tick_nohz_idle_exit();
262 __current_clr_polling();
263 233
264 /* 234 /*
265 * We promise to call sched_ttwu_pending and reschedule 235 * In poll mode we reenable interrupts and spin. Also if we
266 * if need_resched is set while polling is set. That 236 * detected in the wakeup from idle path that the tick
267 * means that clearing polling needs to be visible 237 * broadcast device expired for us, we don't want to go deep
268 * before doing these things. 238 * idle as we know that the IPI is going to arrive right away.
269 */ 239 */
270 smp_mb__after_atomic(); 240 if (cpu_idle_force_poll || tick_check_broadcast_expired())
271 241 cpu_idle_poll();
272 sched_ttwu_pending(); 242 else
273 schedule_preempt_disabled(); 243 cpuidle_idle_call();
244 arch_cpu_idle_exit();
274 } 245 }
246
247 /*
248 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
249 * be set, propagate it into PREEMPT_NEED_RESCHED.
250 *
251 * This is required because for polling idle loops we will not have had
252 * an IPI to fold the state for us.
253 */
254 preempt_set_need_resched();
255 tick_nohz_idle_exit();
256 __current_clr_polling();
257
258 /*
259 * We promise to call sched_ttwu_pending() and reschedule if
260 * need_resched() is set while polling is set. That means that clearing
261 * polling needs to be visible before doing these things.
262 */
263 smp_mb__after_atomic();
264
265 sched_ttwu_pending();
266 schedule_preempt_disabled();
275} 267}
276 268
277bool cpu_in_idle(unsigned long pc) 269bool cpu_in_idle(unsigned long pc)
@@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc)
280 pc < (unsigned long)__cpuidle_text_end; 272 pc < (unsigned long)__cpuidle_text_end;
281} 273}
282 274
275struct idle_timer {
276 struct hrtimer timer;
277 int done;
278};
279
280static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
281{
282 struct idle_timer *it = container_of(timer, struct idle_timer, timer);
283
284 WRITE_ONCE(it->done, 1);
285 set_tsk_need_resched(current);
286
287 return HRTIMER_NORESTART;
288}
289
290void play_idle(unsigned long duration_ms)
291{
292 struct idle_timer it;
293
294 /*
295 * Only FIFO tasks can disable the tick since they don't need the forced
296 * preemption.
297 */
298 WARN_ON_ONCE(current->policy != SCHED_FIFO);
299 WARN_ON_ONCE(current->nr_cpus_allowed != 1);
300 WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
301 WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
302 WARN_ON_ONCE(!duration_ms);
303
304 rcu_sleep_check();
305 preempt_disable();
306 current->flags |= PF_IDLE;
307 cpuidle_use_deepest_state(true);
308
309 it.done = 0;
310 hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
311 it.timer.function = idle_inject_timer_fn;
312 hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
313
314 while (!READ_ONCE(it.done))
315 do_idle();
316
317 cpuidle_use_deepest_state(false);
318 current->flags &= ~PF_IDLE;
319
320 preempt_fold_need_resched();
321 preempt_enable();
322}
323EXPORT_SYMBOL_GPL(play_idle);
324
283void cpu_startup_entry(enum cpuhp_state state) 325void cpu_startup_entry(enum cpuhp_state state)
284{ 326{
285 /* 327 /*
@@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
299#endif 341#endif
300 arch_cpu_idle_prepare(); 342 arch_cpu_idle_prepare();
301 cpuhp_online_idle(state); 343 cpuhp_online_idle(state);
302 cpu_idle_loop(); 344 while (1)
345 do_idle();
303} 346}
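A hedged usage sketch for the new play_idle() interface, modeled on how an idle-injection driver might call it from a per-CPU kthread; the duty cycle and names are assumptions, and play_idle() itself warns unless the caller is a SCHED_FIFO kthread bound to a single CPU with PF_NO_SETAFFINITY set:

static int example_idle_inject_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* force roughly 6 ms of deepest-state idle on this CPU ... */
		play_idle(6);
		/* ... then let normal work run for the rest of the period */
		schedule_timeout_interruptible(msecs_to_jiffies(44));
	}
	return 0;
}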
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0e9505f66ec1..b2a0cff2bb35 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task)
80/* Unpoison the stack for the current task beyond a watermark sp value. */ 80/* Unpoison the stack for the current task beyond a watermark sp value. */
81asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) 81asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
82{ 82{
83 __kasan_unpoison_stack(current, watermark); 83 /*
84 * Calculate the task stack base address. Avoid using 'current'
85 * because this function is called by early resume code which hasn't
86 * yet set up the percpu register (%gs).
87 */
88 void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
89
90 kasan_unpoison_shadow(base, watermark - base);
84} 91}
85 92
86/* 93/*