-rw-r--r--  Documentation/cpu-freq/intel-pstate.txt  |   2
-rw-r--r--  drivers/cpufreq/Kconfig                  |   1
-rw-r--r--  drivers/cpufreq/acpi-cpufreq.c           | 212
-rw-r--r--  drivers/cpufreq/amd_freq_sensitivity.c   |   8
-rw-r--r--  drivers/cpufreq/cpufreq-dt.c             | 300
-rw-r--r--  drivers/cpufreq/cpufreq.c                | 333
-rw-r--r--  drivers/cpufreq/cpufreq_conservative.c   | 282
-rw-r--r--  drivers/cpufreq/cpufreq_governor.c       | 766
-rw-r--r--  drivers/cpufreq/cpufreq_governor.h       | 261
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c       | 445
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.h       |  30
-rw-r--r--  drivers/cpufreq/cpufreq_performance.c    |  18
-rw-r--r--  drivers/cpufreq/cpufreq_powersave.c      |  10
-rw-r--r--  drivers/cpufreq/cpufreq_userspace.c      |  10
-rw-r--r--  drivers/cpufreq/intel_pstate.c           | 192
-rw-r--r--  drivers/cpufreq/powernv-cpufreq.c        | 152
-rw-r--r--  include/linux/cpufreq.h                  |  47
-rw-r--r--  include/linux/sched.h                    |   9
-rw-r--r--  include/trace/events/power.h             |  22
-rw-r--r--  kernel/sched/Makefile                    |   1
-rw-r--r--  kernel/sched/cpufreq.c                   |  37
-rw-r--r--  kernel/sched/deadline.c                  |   4
-rw-r--r--  kernel/sched/fair.c                      |  26
-rw-r--r--  kernel/sched/rt.c                        |   4
-rw-r--r--  kernel/sched/sched.h                     |  48
-rw-r--r--  kernel/trace/power-traces.c              |   1
26 files changed, 1503 insertions, 1718 deletions
diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index f7b12c071d53..e6bd1e6512a5 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -25,7 +25,7 @@ callback, so cpufreq core can't request a transition to a specific frequency.
 The driver provides minimum and maximum frequency limits and callbacks to set a
 policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
 The cpufreq core can request the driver to operate in any of the two policies:
-"performance: and "powersave". The driver decides which frequency to use based
+"performance" and "powersave". The driver decides which frequency to use based
 on the above policy selection considering minimum and maximum frequency limits.
 
 The Intel P-State driver falls under the latter category, which implements the
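
The hunk above fixes a typo in the description of the two policies a setpolicy driver can be asked to run. As a side note, the policy is selected from user space through the standard cpufreq sysfs attribute; the following hedged sketch (plain C, minimal error handling, standard sysfs path assumed) shows that write.

#include <stdio.h>

int main(void)
{
	/* standard cpufreq sysfs attribute for CPU0's policy */
	const char *path = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor";
	FILE *f = fopen(path, "w");		/* needs root */

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("powersave\n", f);		/* or "performance" */
	return fclose(f) ? 1 : 0;
}
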
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index f93511031177..a7f45853c103 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -19,6 +19,7 @@ config CPU_FREQ
 if CPU_FREQ
 
 config CPU_FREQ_GOV_COMMON
+	select IRQ_WORK
 	bool
 
 config CPU_FREQ_BOOST_SW
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 51eef87bbc37..59a7b380fbe2 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -70,6 +70,8 @@ struct acpi_cpufreq_data {
70 unsigned int cpu_feature; 70 unsigned int cpu_feature;
71 unsigned int acpi_perf_cpu; 71 unsigned int acpi_perf_cpu;
72 cpumask_var_t freqdomain_cpus; 72 cpumask_var_t freqdomain_cpus;
73 void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val);
74 u32 (*cpu_freq_read)(struct acpi_pct_register *reg);
73}; 75};
74 76
75/* acpi_perf_data is a pointer to percpu data. */ 77/* acpi_perf_data is a pointer to percpu data. */
@@ -243,125 +245,119 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
243 } 245 }
244} 246}
245 247
246struct msr_addr { 248u32 cpu_freq_read_intel(struct acpi_pct_register *not_used)
247 u32 reg; 249{
248}; 250 u32 val, dummy;
249 251
250struct io_addr { 252 rdmsr(MSR_IA32_PERF_CTL, val, dummy);
251 u16 port; 253 return val;
252 u8 bit_width; 254}
253}; 255
256void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val)
257{
258 u32 lo, hi;
259
260 rdmsr(MSR_IA32_PERF_CTL, lo, hi);
261 lo = (lo & ~INTEL_MSR_RANGE) | (val & INTEL_MSR_RANGE);
262 wrmsr(MSR_IA32_PERF_CTL, lo, hi);
263}
264
265u32 cpu_freq_read_amd(struct acpi_pct_register *not_used)
266{
267 u32 val, dummy;
268
269 rdmsr(MSR_AMD_PERF_CTL, val, dummy);
270 return val;
271}
272
273void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val)
274{
275 wrmsr(MSR_AMD_PERF_CTL, val, 0);
276}
277
278u32 cpu_freq_read_io(struct acpi_pct_register *reg)
279{
280 u32 val;
281
282 acpi_os_read_port(reg->address, &val, reg->bit_width);
283 return val;
284}
285
286void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val)
287{
288 acpi_os_write_port(reg->address, val, reg->bit_width);
289}
254 290
255struct drv_cmd { 291struct drv_cmd {
256 unsigned int type; 292 struct acpi_pct_register *reg;
257 const struct cpumask *mask;
258 union {
259 struct msr_addr msr;
260 struct io_addr io;
261 } addr;
262 u32 val; 293 u32 val;
294 union {
295 void (*write)(struct acpi_pct_register *reg, u32 val);
296 u32 (*read)(struct acpi_pct_register *reg);
297 } func;
263}; 298};
264 299
265/* Called via smp_call_function_single(), on the target CPU */ 300/* Called via smp_call_function_single(), on the target CPU */
266static void do_drv_read(void *_cmd) 301static void do_drv_read(void *_cmd)
267{ 302{
268 struct drv_cmd *cmd = _cmd; 303 struct drv_cmd *cmd = _cmd;
269 u32 h;
270 304
271 switch (cmd->type) { 305 cmd->val = cmd->func.read(cmd->reg);
272 case SYSTEM_INTEL_MSR_CAPABLE:
273 case SYSTEM_AMD_MSR_CAPABLE:
274 rdmsr(cmd->addr.msr.reg, cmd->val, h);
275 break;
276 case SYSTEM_IO_CAPABLE:
277 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
278 &cmd->val,
279 (u32)cmd->addr.io.bit_width);
280 break;
281 default:
282 break;
283 }
284} 306}
285 307
286/* Called via smp_call_function_many(), on the target CPUs */ 308static u32 drv_read(struct acpi_cpufreq_data *data, const struct cpumask *mask)
287static void do_drv_write(void *_cmd)
288{ 309{
289 struct drv_cmd *cmd = _cmd; 310 struct acpi_processor_performance *perf = to_perf_data(data);
290 u32 lo, hi; 311 struct drv_cmd cmd = {
312 .reg = &perf->control_register,
313 .func.read = data->cpu_freq_read,
314 };
315 int err;
291 316
292 switch (cmd->type) { 317 err = smp_call_function_any(mask, do_drv_read, &cmd, 1);
293 case SYSTEM_INTEL_MSR_CAPABLE: 318 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
294 rdmsr(cmd->addr.msr.reg, lo, hi); 319 return cmd.val;
295 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
296 wrmsr(cmd->addr.msr.reg, lo, hi);
297 break;
298 case SYSTEM_AMD_MSR_CAPABLE:
299 wrmsr(cmd->addr.msr.reg, cmd->val, 0);
300 break;
301 case SYSTEM_IO_CAPABLE:
302 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
303 cmd->val,
304 (u32)cmd->addr.io.bit_width);
305 break;
306 default:
307 break;
308 }
309} 320}
310 321
311static void drv_read(struct drv_cmd *cmd) 322/* Called via smp_call_function_many(), on the target CPUs */
323static void do_drv_write(void *_cmd)
312{ 324{
313 int err; 325 struct drv_cmd *cmd = _cmd;
314 cmd->val = 0;
315 326
316 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); 327 cmd->func.write(cmd->reg, cmd->val);
317 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
318} 328}
319 329
320static void drv_write(struct drv_cmd *cmd) 330static void drv_write(struct acpi_cpufreq_data *data,
331 const struct cpumask *mask, u32 val)
321{ 332{
333 struct acpi_processor_performance *perf = to_perf_data(data);
334 struct drv_cmd cmd = {
335 .reg = &perf->control_register,
336 .val = val,
337 .func.write = data->cpu_freq_write,
338 };
322 int this_cpu; 339 int this_cpu;
323 340
324 this_cpu = get_cpu(); 341 this_cpu = get_cpu();
325 if (cpumask_test_cpu(this_cpu, cmd->mask)) 342 if (cpumask_test_cpu(this_cpu, mask))
326 do_drv_write(cmd); 343 do_drv_write(&cmd);
327 smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); 344
345 smp_call_function_many(mask, do_drv_write, &cmd, 1);
328 put_cpu(); 346 put_cpu();
329} 347}
330 348
331static u32 349static u32 get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data)
332get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data)
333{ 350{
334 struct acpi_processor_performance *perf; 351 u32 val;
335 struct drv_cmd cmd;
336 352
337 if (unlikely(cpumask_empty(mask))) 353 if (unlikely(cpumask_empty(mask)))
338 return 0; 354 return 0;
339 355
340 switch (data->cpu_feature) { 356 val = drv_read(data, mask);
341 case SYSTEM_INTEL_MSR_CAPABLE:
342 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
343 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
344 break;
345 case SYSTEM_AMD_MSR_CAPABLE:
346 cmd.type = SYSTEM_AMD_MSR_CAPABLE;
347 cmd.addr.msr.reg = MSR_AMD_PERF_CTL;
348 break;
349 case SYSTEM_IO_CAPABLE:
350 cmd.type = SYSTEM_IO_CAPABLE;
351 perf = to_perf_data(data);
352 cmd.addr.io.port = perf->control_register.address;
353 cmd.addr.io.bit_width = perf->control_register.bit_width;
354 break;
355 default:
356 return 0;
357 }
358
359 cmd.mask = mask;
360 drv_read(&cmd);
361 357
362 pr_debug("get_cur_val = %u\n", cmd.val); 358 pr_debug("get_cur_val = %u\n", val);
363 359
364 return cmd.val; 360 return val;
365} 361}
366 362
367static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 363static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
@@ -416,7 +412,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
416{ 412{
417 struct acpi_cpufreq_data *data = policy->driver_data; 413 struct acpi_cpufreq_data *data = policy->driver_data;
418 struct acpi_processor_performance *perf; 414 struct acpi_processor_performance *perf;
419 struct drv_cmd cmd; 415 const struct cpumask *mask;
420 unsigned int next_perf_state = 0; /* Index into perf table */ 416 unsigned int next_perf_state = 0; /* Index into perf table */
421 int result = 0; 417 int result = 0;
422 418
@@ -434,42 +430,21 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
434 } else { 430 } else {
435 pr_debug("Already at target state (P%d)\n", 431 pr_debug("Already at target state (P%d)\n",
436 next_perf_state); 432 next_perf_state);
437 goto out; 433 return 0;
438 } 434 }
439 } 435 }
440 436
441 switch (data->cpu_feature) { 437 /*
442 case SYSTEM_INTEL_MSR_CAPABLE: 438 * The core won't allow CPUs to go away until the governor has been
443 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 439 * stopped, so we can rely on the stability of policy->cpus.
444 cmd.addr.msr.reg = MSR_IA32_PERF_CTL; 440 */
445 cmd.val = (u32) perf->states[next_perf_state].control; 441 mask = policy->shared_type == CPUFREQ_SHARED_TYPE_ANY ?
446 break; 442 cpumask_of(policy->cpu) : policy->cpus;
447 case SYSTEM_AMD_MSR_CAPABLE:
448 cmd.type = SYSTEM_AMD_MSR_CAPABLE;
449 cmd.addr.msr.reg = MSR_AMD_PERF_CTL;
450 cmd.val = (u32) perf->states[next_perf_state].control;
451 break;
452 case SYSTEM_IO_CAPABLE:
453 cmd.type = SYSTEM_IO_CAPABLE;
454 cmd.addr.io.port = perf->control_register.address;
455 cmd.addr.io.bit_width = perf->control_register.bit_width;
456 cmd.val = (u32) perf->states[next_perf_state].control;
457 break;
458 default:
459 result = -ENODEV;
460 goto out;
461 }
462
463 /* cpufreq holds the hotplug lock, so we are safe from here on */
464 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
465 cmd.mask = policy->cpus;
466 else
467 cmd.mask = cpumask_of(policy->cpu);
468 443
469 drv_write(&cmd); 444 drv_write(data, mask, perf->states[next_perf_state].control);
470 445
471 if (acpi_pstate_strict) { 446 if (acpi_pstate_strict) {
472 if (!check_freqs(cmd.mask, data->freq_table[index].frequency, 447 if (!check_freqs(mask, data->freq_table[index].frequency,
473 data)) { 448 data)) {
474 pr_debug("acpi_cpufreq_target failed (%d)\n", 449 pr_debug("acpi_cpufreq_target failed (%d)\n",
475 policy->cpu); 450 policy->cpu);
@@ -480,7 +455,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
480 if (!result) 455 if (!result)
481 perf->state = next_perf_state; 456 perf->state = next_perf_state;
482 457
483out:
484 return result; 458 return result;
485} 459}
486 460
@@ -740,15 +714,21 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
740 } 714 }
741 pr_debug("SYSTEM IO addr space\n"); 715 pr_debug("SYSTEM IO addr space\n");
742 data->cpu_feature = SYSTEM_IO_CAPABLE; 716 data->cpu_feature = SYSTEM_IO_CAPABLE;
717 data->cpu_freq_read = cpu_freq_read_io;
718 data->cpu_freq_write = cpu_freq_write_io;
743 break; 719 break;
744 case ACPI_ADR_SPACE_FIXED_HARDWARE: 720 case ACPI_ADR_SPACE_FIXED_HARDWARE:
745 pr_debug("HARDWARE addr space\n"); 721 pr_debug("HARDWARE addr space\n");
746 if (check_est_cpu(cpu)) { 722 if (check_est_cpu(cpu)) {
747 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; 723 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
724 data->cpu_freq_read = cpu_freq_read_intel;
725 data->cpu_freq_write = cpu_freq_write_intel;
748 break; 726 break;
749 } 727 }
750 if (check_amd_hwpstate_cpu(cpu)) { 728 if (check_amd_hwpstate_cpu(cpu)) {
751 data->cpu_feature = SYSTEM_AMD_MSR_CAPABLE; 729 data->cpu_feature = SYSTEM_AMD_MSR_CAPABLE;
730 data->cpu_freq_read = cpu_freq_read_amd;
731 data->cpu_freq_write = cpu_freq_write_amd;
752 break; 732 break;
753 } 733 }
754 result = -ENODEV; 734 result = -ENODEV;
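
The acpi-cpufreq rework above replaces the per-call switch on data->cpu_feature with cpu_freq_read/cpu_freq_write callbacks picked once in acpi_cpufreq_cpu_init(). Below is a minimal user-space sketch of that dispatch pattern in isolation; pct_register, freq_driver_data and the io_read/io_write bodies are illustrative stand-ins, not the kernel API.

#include <stdint.h>
#include <stdio.h>

struct pct_register {			/* stand-in for struct acpi_pct_register */
	uint64_t address;
	unsigned int bit_width;
};

struct freq_driver_data {		/* stand-in for struct acpi_cpufreq_data */
	struct pct_register reg;
	uint32_t (*cpu_freq_read)(struct pct_register *reg);
	void (*cpu_freq_write)(struct pct_register *reg, uint32_t val);
};

static uint32_t io_read(struct pct_register *reg)
{
	/* the kernel version calls acpi_os_read_port() here */
	printf("io read  @0x%llx (%u bits)\n",
	       (unsigned long long)reg->address, reg->bit_width);
	return 0;
}

static void io_write(struct pct_register *reg, uint32_t val)
{
	/* the kernel version calls acpi_os_write_port() here */
	printf("io write @0x%llx (%u bits) <- 0x%x\n",
	       (unsigned long long)reg->address, reg->bit_width, val);
}

static void driver_init(struct freq_driver_data *data)
{
	/* pick the callbacks once, as acpi_cpufreq_cpu_init() now does */
	data->cpu_freq_read = io_read;
	data->cpu_freq_write = io_write;
}

int main(void)
{
	struct freq_driver_data data = {
		.reg = { .address = 0xb2, .bit_width = 8 },
	};

	driver_init(&data);
	data.cpu_freq_write(&data.reg, 0x17);	/* hot path: no switch on cpu_feature */
	(void)data.cpu_freq_read(&data.reg);
	return 0;
}

Selecting the callbacks at init keeps the SMP hot path (do_drv_read/do_drv_write) branch-free and lets struct drv_cmd shrink to a register pointer, a value and a function pointer.
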
diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c
index f6b79ab0070b..404360cad25c 100644
--- a/drivers/cpufreq/amd_freq_sensitivity.c
+++ b/drivers/cpufreq/amd_freq_sensitivity.c
@@ -21,7 +21,7 @@
 #include <asm/msr.h>
 #include <asm/cpufeature.h>
 
-#include "cpufreq_governor.h"
+#include "cpufreq_ondemand.h"
 
 #define MSR_AMD64_FREQ_SENSITIVITY_ACTUAL	0xc0010080
 #define MSR_AMD64_FREQ_SENSITIVITY_REFERENCE	0xc0010081
@@ -45,10 +45,10 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy,
 	long d_actual, d_reference;
 	struct msr actual, reference;
 	struct cpu_data_t *data = &per_cpu(cpu_data, policy->cpu);
-	struct dbs_data *od_data = policy->governor_data;
+	struct policy_dbs_info *policy_dbs = policy->governor_data;
+	struct dbs_data *od_data = policy_dbs->dbs_data;
 	struct od_dbs_tuners *od_tuners = od_data->tuners;
-	struct od_cpu_dbs_info_s *od_info =
-		od_data->cdata->get_cpu_dbs_info_s(policy->cpu);
+	struct od_policy_dbs_info *od_info = to_dbs_info(policy_dbs);
 
 	if (!od_info->freq_table)
 		return freq_next;
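
to_dbs_info() used above is a container_of() lookup: the ondemand governor embeds the common struct policy_dbs_info inside its own per-policy structure and recovers the wrapper from a pointer to the embedded member. A stand-alone sketch with container_of() open-coded and toy field names:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct policy_dbs_info {		/* common per-policy governor data */
	int sample_delay_ns;
};

struct od_policy_dbs_info {		/* ondemand-specific wrapper */
	struct policy_dbs_info policy_dbs;
	unsigned int freq_lo;
};

static struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *pd)
{
	return container_of(pd, struct od_policy_dbs_info, policy_dbs);
}

int main(void)
{
	struct od_policy_dbs_info od = { .freq_lo = 800000 };
	struct policy_dbs_info *common = &od.policy_dbs;

	/* recover the ondemand wrapper from the common pointer */
	printf("freq_lo = %u\n", to_dbs_info(common)->freq_lo);
	return 0;
}
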
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 0ca74d070058..f951f911786e 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -31,9 +31,8 @@
31 31
32struct private_data { 32struct private_data {
33 struct device *cpu_dev; 33 struct device *cpu_dev;
34 struct regulator *cpu_reg;
35 struct thermal_cooling_device *cdev; 34 struct thermal_cooling_device *cdev;
36 unsigned int voltage_tolerance; /* in percentage */ 35 const char *reg_name;
37}; 36};
38 37
39static struct freq_attr *cpufreq_dt_attr[] = { 38static struct freq_attr *cpufreq_dt_attr[] = {
@@ -44,175 +43,128 @@ static struct freq_attr *cpufreq_dt_attr[] = {
44 43
45static int set_target(struct cpufreq_policy *policy, unsigned int index) 44static int set_target(struct cpufreq_policy *policy, unsigned int index)
46{ 45{
47 struct dev_pm_opp *opp;
48 struct cpufreq_frequency_table *freq_table = policy->freq_table;
49 struct clk *cpu_clk = policy->clk;
50 struct private_data *priv = policy->driver_data; 46 struct private_data *priv = policy->driver_data;
51 struct device *cpu_dev = priv->cpu_dev;
52 struct regulator *cpu_reg = priv->cpu_reg;
53 unsigned long volt = 0, tol = 0;
54 int volt_old = 0;
55 unsigned int old_freq, new_freq;
56 long freq_Hz, freq_exact;
57 int ret;
58
59 freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
60 if (freq_Hz <= 0)
61 freq_Hz = freq_table[index].frequency * 1000;
62 47
63 freq_exact = freq_Hz; 48 return dev_pm_opp_set_rate(priv->cpu_dev,
64 new_freq = freq_Hz / 1000; 49 policy->freq_table[index].frequency * 1000);
65 old_freq = clk_get_rate(cpu_clk) / 1000; 50}
66 51
67 if (!IS_ERR(cpu_reg)) { 52/*
68 unsigned long opp_freq; 53 * An earlier version of opp-v1 bindings used to name the regulator
54 * "cpu0-supply", we still need to handle that for backwards compatibility.
55 */
56static const char *find_supply_name(struct device *dev)
57{
58 struct device_node *np;
59 struct property *pp;
60 int cpu = dev->id;
61 const char *name = NULL;
69 62
70 rcu_read_lock(); 63 np = of_node_get(dev->of_node);
71 opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
72 if (IS_ERR(opp)) {
73 rcu_read_unlock();
74 dev_err(cpu_dev, "failed to find OPP for %ld\n",
75 freq_Hz);
76 return PTR_ERR(opp);
77 }
78 volt = dev_pm_opp_get_voltage(opp);
79 opp_freq = dev_pm_opp_get_freq(opp);
80 rcu_read_unlock();
81 tol = volt * priv->voltage_tolerance / 100;
82 volt_old = regulator_get_voltage(cpu_reg);
83 dev_dbg(cpu_dev, "Found OPP: %ld kHz, %ld uV\n",
84 opp_freq / 1000, volt);
85 }
86 64
87 dev_dbg(cpu_dev, "%u MHz, %d mV --> %u MHz, %ld mV\n", 65 /* This must be valid for sure */
88 old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1, 66 if (WARN_ON(!np))
89 new_freq / 1000, volt ? volt / 1000 : -1); 67 return NULL;
90 68
91 /* scaling up? scale voltage before frequency */ 69 /* Try "cpu0" for older DTs */
92 if (!IS_ERR(cpu_reg) && new_freq > old_freq) { 70 if (!cpu) {
93 ret = regulator_set_voltage_tol(cpu_reg, volt, tol); 71 pp = of_find_property(np, "cpu0-supply", NULL);
94 if (ret) { 72 if (pp) {
95 dev_err(cpu_dev, "failed to scale voltage up: %d\n", 73 name = "cpu0";
96 ret); 74 goto node_put;
97 return ret;
98 } 75 }
99 } 76 }
100 77
101 ret = clk_set_rate(cpu_clk, freq_exact); 78 pp = of_find_property(np, "cpu-supply", NULL);
102 if (ret) { 79 if (pp) {
103 dev_err(cpu_dev, "failed to set clock rate: %d\n", ret); 80 name = "cpu";
104 if (!IS_ERR(cpu_reg) && volt_old > 0) 81 goto node_put;
105 regulator_set_voltage_tol(cpu_reg, volt_old, tol);
106 return ret;
107 } 82 }
108 83
109 /* scaling down? scale voltage after frequency */ 84 dev_dbg(dev, "no regulator for cpu%d\n", cpu);
110 if (!IS_ERR(cpu_reg) && new_freq < old_freq) { 85node_put:
111 ret = regulator_set_voltage_tol(cpu_reg, volt, tol); 86 of_node_put(np);
112 if (ret) { 87 return name;
113 dev_err(cpu_dev, "failed to scale voltage down: %d\n",
114 ret);
115 clk_set_rate(cpu_clk, old_freq * 1000);
116 }
117 }
118
119 return ret;
120} 88}
121 89
122static int allocate_resources(int cpu, struct device **cdev, 90static int resources_available(void)
123 struct regulator **creg, struct clk **cclk)
124{ 91{
125 struct device *cpu_dev; 92 struct device *cpu_dev;
126 struct regulator *cpu_reg; 93 struct regulator *cpu_reg;
127 struct clk *cpu_clk; 94 struct clk *cpu_clk;
128 int ret = 0; 95 int ret = 0;
129 char *reg_cpu0 = "cpu0", *reg_cpu = "cpu", *reg; 96 const char *name;
130 97
131 cpu_dev = get_cpu_device(cpu); 98 cpu_dev = get_cpu_device(0);
132 if (!cpu_dev) { 99 if (!cpu_dev) {
133 pr_err("failed to get cpu%d device\n", cpu); 100 pr_err("failed to get cpu0 device\n");
134 return -ENODEV; 101 return -ENODEV;
135 } 102 }
136 103
137 /* Try "cpu0" for older DTs */ 104 cpu_clk = clk_get(cpu_dev, NULL);
138 if (!cpu) 105 ret = PTR_ERR_OR_ZERO(cpu_clk);
139 reg = reg_cpu0;
140 else
141 reg = reg_cpu;
142
143try_again:
144 cpu_reg = regulator_get_optional(cpu_dev, reg);
145 ret = PTR_ERR_OR_ZERO(cpu_reg);
146 if (ret) { 106 if (ret) {
147 /* 107 /*
148 * If cpu's regulator supply node is present, but regulator is 108 * If cpu's clk node is present, but clock is not yet
149 * not yet registered, we should try defering probe. 109 * registered, we should try defering probe.
150 */ 110 */
151 if (ret == -EPROBE_DEFER) { 111 if (ret == -EPROBE_DEFER)
152 dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n", 112 dev_dbg(cpu_dev, "clock not ready, retry\n");
153 cpu); 113 else
154 return ret; 114 dev_err(cpu_dev, "failed to get clock: %d\n", ret);
155 }
156
157 /* Try with "cpu-supply" */
158 if (reg == reg_cpu0) {
159 reg = reg_cpu;
160 goto try_again;
161 }
162 115
163 dev_dbg(cpu_dev, "no regulator for cpu%d: %d\n", cpu, ret); 116 return ret;
164 } 117 }
165 118
166 cpu_clk = clk_get(cpu_dev, NULL); 119 clk_put(cpu_clk);
167 ret = PTR_ERR_OR_ZERO(cpu_clk);
168 if (ret) {
169 /* put regulator */
170 if (!IS_ERR(cpu_reg))
171 regulator_put(cpu_reg);
172 120
121 name = find_supply_name(cpu_dev);
122 /* Platform doesn't require regulator */
123 if (!name)
124 return 0;
125
126 cpu_reg = regulator_get_optional(cpu_dev, name);
127 ret = PTR_ERR_OR_ZERO(cpu_reg);
128 if (ret) {
173 /* 129 /*
174 * If cpu's clk node is present, but clock is not yet 130 * If cpu's regulator supply node is present, but regulator is
175 * registered, we should try defering probe. 131 * not yet registered, we should try defering probe.
176 */ 132 */
177 if (ret == -EPROBE_DEFER) 133 if (ret == -EPROBE_DEFER)
178 dev_dbg(cpu_dev, "cpu%d clock not ready, retry\n", cpu); 134 dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n");
179 else 135 else
180 dev_err(cpu_dev, "failed to get cpu%d clock: %d\n", cpu, 136 dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret);
181 ret); 137
182 } else { 138 return ret;
183 *cdev = cpu_dev;
184 *creg = cpu_reg;
185 *cclk = cpu_clk;
186 } 139 }
187 140
188 return ret; 141 regulator_put(cpu_reg);
142 return 0;
189} 143}
190 144
191static int cpufreq_init(struct cpufreq_policy *policy) 145static int cpufreq_init(struct cpufreq_policy *policy)
192{ 146{
193 struct cpufreq_frequency_table *freq_table; 147 struct cpufreq_frequency_table *freq_table;
194 struct device_node *np;
195 struct private_data *priv; 148 struct private_data *priv;
196 struct device *cpu_dev; 149 struct device *cpu_dev;
197 struct regulator *cpu_reg;
198 struct clk *cpu_clk; 150 struct clk *cpu_clk;
199 struct dev_pm_opp *suspend_opp; 151 struct dev_pm_opp *suspend_opp;
200 unsigned long min_uV = ~0, max_uV = 0;
201 unsigned int transition_latency; 152 unsigned int transition_latency;
202 bool need_update = false; 153 bool opp_v1 = false;
154 const char *name;
203 int ret; 155 int ret;
204 156
205 ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk); 157 cpu_dev = get_cpu_device(policy->cpu);
206 if (ret) { 158 if (!cpu_dev) {
207 pr_err("%s: Failed to allocate resources: %d\n", __func__, ret); 159 pr_err("failed to get cpu%d device\n", policy->cpu);
208 return ret; 160 return -ENODEV;
209 } 161 }
210 162
211 np = of_node_get(cpu_dev->of_node); 163 cpu_clk = clk_get(cpu_dev, NULL);
212 if (!np) { 164 if (IS_ERR(cpu_clk)) {
213 dev_err(cpu_dev, "failed to find cpu%d node\n", policy->cpu); 165 ret = PTR_ERR(cpu_clk);
214 ret = -ENOENT; 166 dev_err(cpu_dev, "%s: failed to get clk: %d\n", __func__, ret);
215 goto out_put_reg_clk; 167 return ret;
216 } 168 }
217 169
218 /* Get OPP-sharing information from "operating-points-v2" bindings */ 170 /* Get OPP-sharing information from "operating-points-v2" bindings */
@@ -223,9 +175,23 @@ static int cpufreq_init(struct cpufreq_policy *policy)
223 * finding shared-OPPs for backward compatibility. 175 * finding shared-OPPs for backward compatibility.
224 */ 176 */
225 if (ret == -ENOENT) 177 if (ret == -ENOENT)
226 need_update = true; 178 opp_v1 = true;
227 else 179 else
228 goto out_node_put; 180 goto out_put_clk;
181 }
182
183 /*
184 * OPP layer will be taking care of regulators now, but it needs to know
185 * the name of the regulator first.
186 */
187 name = find_supply_name(cpu_dev);
188 if (name) {
189 ret = dev_pm_opp_set_regulator(cpu_dev, name);
190 if (ret) {
191 dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
192 policy->cpu, ret);
193 goto out_put_clk;
194 }
229 } 195 }
230 196
231 /* 197 /*
@@ -246,12 +212,12 @@ static int cpufreq_init(struct cpufreq_policy *policy)
246 */ 212 */
247 ret = dev_pm_opp_get_opp_count(cpu_dev); 213 ret = dev_pm_opp_get_opp_count(cpu_dev);
248 if (ret <= 0) { 214 if (ret <= 0) {
249 pr_debug("OPP table is not ready, deferring probe\n"); 215 dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
250 ret = -EPROBE_DEFER; 216 ret = -EPROBE_DEFER;
251 goto out_free_opp; 217 goto out_free_opp;
252 } 218 }
253 219
254 if (need_update) { 220 if (opp_v1) {
255 struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data(); 221 struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
256 222
257 if (!pd || !pd->independent_clocks) 223 if (!pd || !pd->independent_clocks)
@@ -265,10 +231,6 @@ static int cpufreq_init(struct cpufreq_policy *policy)
265 if (ret) 231 if (ret)
266 dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n", 232 dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
267 __func__, ret); 233 __func__, ret);
268
269 of_property_read_u32(np, "clock-latency", &transition_latency);
270 } else {
271 transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev);
272 } 234 }
273 235
274 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 236 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -277,62 +239,16 @@ static int cpufreq_init(struct cpufreq_policy *policy)
277 goto out_free_opp; 239 goto out_free_opp;
278 } 240 }
279 241
280 of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance); 242 priv->reg_name = name;
281
282 if (!transition_latency)
283 transition_latency = CPUFREQ_ETERNAL;
284
285 if (!IS_ERR(cpu_reg)) {
286 unsigned long opp_freq = 0;
287
288 /*
289 * Disable any OPPs where the connected regulator isn't able to
290 * provide the specified voltage and record minimum and maximum
291 * voltage levels.
292 */
293 while (1) {
294 struct dev_pm_opp *opp;
295 unsigned long opp_uV, tol_uV;
296
297 rcu_read_lock();
298 opp = dev_pm_opp_find_freq_ceil(cpu_dev, &opp_freq);
299 if (IS_ERR(opp)) {
300 rcu_read_unlock();
301 break;
302 }
303 opp_uV = dev_pm_opp_get_voltage(opp);
304 rcu_read_unlock();
305
306 tol_uV = opp_uV * priv->voltage_tolerance / 100;
307 if (regulator_is_supported_voltage(cpu_reg,
308 opp_uV - tol_uV,
309 opp_uV + tol_uV)) {
310 if (opp_uV < min_uV)
311 min_uV = opp_uV;
312 if (opp_uV > max_uV)
313 max_uV = opp_uV;
314 } else {
315 dev_pm_opp_disable(cpu_dev, opp_freq);
316 }
317
318 opp_freq++;
319 }
320
321 ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV);
322 if (ret > 0)
323 transition_latency += ret * 1000;
324 }
325 243
326 ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); 244 ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
327 if (ret) { 245 if (ret) {
328 pr_err("failed to init cpufreq table: %d\n", ret); 246 dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
329 goto out_free_priv; 247 goto out_free_priv;
330 } 248 }
331 249
332 priv->cpu_dev = cpu_dev; 250 priv->cpu_dev = cpu_dev;
333 priv->cpu_reg = cpu_reg;
334 policy->driver_data = priv; 251 policy->driver_data = priv;
335
336 policy->clk = cpu_clk; 252 policy->clk = cpu_clk;
337 253
338 rcu_read_lock(); 254 rcu_read_lock();
@@ -357,9 +273,11 @@ static int cpufreq_init(struct cpufreq_policy *policy)
357 cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs; 273 cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;
358 } 274 }
359 275
360 policy->cpuinfo.transition_latency = transition_latency; 276 transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev);
277 if (!transition_latency)
278 transition_latency = CPUFREQ_ETERNAL;
361 279
362 of_node_put(np); 280 policy->cpuinfo.transition_latency = transition_latency;
363 281
364 return 0; 282 return 0;
365 283
@@ -369,12 +287,10 @@ out_free_priv:
369 kfree(priv); 287 kfree(priv);
370out_free_opp: 288out_free_opp:
371 dev_pm_opp_of_cpumask_remove_table(policy->cpus); 289 dev_pm_opp_of_cpumask_remove_table(policy->cpus);
372out_node_put: 290 if (name)
373 of_node_put(np); 291 dev_pm_opp_put_regulator(cpu_dev);
374out_put_reg_clk: 292out_put_clk:
375 clk_put(cpu_clk); 293 clk_put(cpu_clk);
376 if (!IS_ERR(cpu_reg))
377 regulator_put(cpu_reg);
378 294
379 return ret; 295 return ret;
380} 296}
@@ -386,9 +302,10 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
386 cpufreq_cooling_unregister(priv->cdev); 302 cpufreq_cooling_unregister(priv->cdev);
387 dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); 303 dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
388 dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); 304 dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
305 if (priv->reg_name)
306 dev_pm_opp_put_regulator(priv->cpu_dev);
307
389 clk_put(policy->clk); 308 clk_put(policy->clk);
390 if (!IS_ERR(priv->cpu_reg))
391 regulator_put(priv->cpu_reg);
392 kfree(priv); 309 kfree(priv);
393 310
394 return 0; 311 return 0;
@@ -441,9 +358,6 @@ static struct cpufreq_driver dt_cpufreq_driver = {
441 358
442static int dt_cpufreq_probe(struct platform_device *pdev) 359static int dt_cpufreq_probe(struct platform_device *pdev)
443{ 360{
444 struct device *cpu_dev;
445 struct regulator *cpu_reg;
446 struct clk *cpu_clk;
447 int ret; 361 int ret;
448 362
449 /* 363 /*
@@ -453,19 +367,15 @@ static int dt_cpufreq_probe(struct platform_device *pdev)
453 * 367 *
454 * FIXME: Is checking this only for CPU0 sufficient ? 368 * FIXME: Is checking this only for CPU0 sufficient ?
455 */ 369 */
456 ret = allocate_resources(0, &cpu_dev, &cpu_reg, &cpu_clk); 370 ret = resources_available();
457 if (ret) 371 if (ret)
458 return ret; 372 return ret;
459 373
460 clk_put(cpu_clk);
461 if (!IS_ERR(cpu_reg))
462 regulator_put(cpu_reg);
463
464 dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev); 374 dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev);
465 375
466 ret = cpufreq_register_driver(&dt_cpufreq_driver); 376 ret = cpufreq_register_driver(&dt_cpufreq_driver);
467 if (ret) 377 if (ret)
468 dev_err(cpu_dev, "failed register driver: %d\n", ret); 378 dev_err(&pdev->dev, "failed register driver: %d\n", ret);
469 379
470 return ret; 380 return ret;
471} 381}
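
With the cpufreq-dt changes above, set_target() simply calls dev_pm_opp_set_rate() and the OPP core sequences the regulator and clock updates that the driver used to do by hand. The sketch below shows only that ordering rule (raise the voltage before raising the frequency, lower it afterwards when scaling down); set_voltage() and set_rate() are placeholders, not the OPP, clk or regulator APIs.

#include <stdbool.h>
#include <stdio.h>

static unsigned long cur_hz = 800000000, cur_uv = 900000;

static void set_voltage(unsigned long uv)
{
	cur_uv = uv;
	printf("V -> %lu uV\n", uv);
}

static void set_rate(unsigned long hz)
{
	cur_hz = hz;
	printf("F -> %lu Hz\n", hz);
}

static void dvfs_set(unsigned long new_hz, unsigned long new_uv)
{
	bool scaling_up = new_hz > cur_hz;

	if (scaling_up)
		set_voltage(new_uv);	/* more volts first, then more MHz */
	set_rate(new_hz);
	if (!scaling_up)
		set_voltage(new_uv);	/* fewer MHz first, then fewer volts */
}

int main(void)
{
	dvfs_set(1200000000, 1100000);	/* scale up   */
	dvfs_set(600000000,   850000);	/* scale down */
	return 0;
}
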
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e979ec78b695..4c7825856eab 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -38,48 +38,10 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy)
38 return cpumask_empty(policy->cpus); 38 return cpumask_empty(policy->cpus);
39} 39}
40 40
41static bool suitable_policy(struct cpufreq_policy *policy, bool active)
42{
43 return active == !policy_is_inactive(policy);
44}
45
46/* Finds Next Acive/Inactive policy */
47static struct cpufreq_policy *next_policy(struct cpufreq_policy *policy,
48 bool active)
49{
50 do {
51 /* No more policies in the list */
52 if (list_is_last(&policy->policy_list, &cpufreq_policy_list))
53 return NULL;
54
55 policy = list_next_entry(policy, policy_list);
56 } while (!suitable_policy(policy, active));
57
58 return policy;
59}
60
61static struct cpufreq_policy *first_policy(bool active)
62{
63 struct cpufreq_policy *policy;
64
65 /* No policies in the list */
66 if (list_empty(&cpufreq_policy_list))
67 return NULL;
68
69 policy = list_first_entry(&cpufreq_policy_list, typeof(*policy),
70 policy_list);
71
72 if (!suitable_policy(policy, active))
73 policy = next_policy(policy, active);
74
75 return policy;
76}
77
78/* Macros to iterate over CPU policies */ 41/* Macros to iterate over CPU policies */
79#define for_each_suitable_policy(__policy, __active) \ 42#define for_each_suitable_policy(__policy, __active) \
80 for (__policy = first_policy(__active); \ 43 list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
81 __policy; \ 44 if ((__active) == !policy_is_inactive(__policy))
82 __policy = next_policy(__policy, __active))
83 45
84#define for_each_active_policy(__policy) \ 46#define for_each_active_policy(__policy) \
85 for_each_suitable_policy(__policy, true) 47 for_each_suitable_policy(__policy, true)
@@ -102,7 +64,6 @@ static LIST_HEAD(cpufreq_governor_list);
102static struct cpufreq_driver *cpufreq_driver; 64static struct cpufreq_driver *cpufreq_driver;
103static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); 65static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
104static DEFINE_RWLOCK(cpufreq_driver_lock); 66static DEFINE_RWLOCK(cpufreq_driver_lock);
105DEFINE_MUTEX(cpufreq_governor_lock);
106 67
107/* Flag to suspend/resume CPUFreq governors */ 68/* Flag to suspend/resume CPUFreq governors */
108static bool cpufreq_suspended; 69static bool cpufreq_suspended;
@@ -113,10 +74,8 @@ static inline bool has_target(void)
113} 74}
114 75
115/* internal prototypes */ 76/* internal prototypes */
116static int __cpufreq_governor(struct cpufreq_policy *policy, 77static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
117 unsigned int event);
118static unsigned int __cpufreq_get(struct cpufreq_policy *policy); 78static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
119static void handle_update(struct work_struct *work);
120 79
121/** 80/**
122 * Two notifier lists: the "policy" list is involved in the 81 * Two notifier lists: the "policy" list is involved in the
@@ -818,12 +777,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
818 ssize_t ret; 777 ssize_t ret;
819 778
820 down_read(&policy->rwsem); 779 down_read(&policy->rwsem);
821 780 ret = fattr->show(policy, buf);
822 if (fattr->show)
823 ret = fattr->show(policy, buf);
824 else
825 ret = -EIO;
826
827 up_read(&policy->rwsem); 781 up_read(&policy->rwsem);
828 782
829 return ret; 783 return ret;
@@ -838,18 +792,12 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
838 792
839 get_online_cpus(); 793 get_online_cpus();
840 794
841 if (!cpu_online(policy->cpu)) 795 if (cpu_online(policy->cpu)) {
842 goto unlock; 796 down_write(&policy->rwsem);
843
844 down_write(&policy->rwsem);
845
846 if (fattr->store)
847 ret = fattr->store(policy, buf, count); 797 ret = fattr->store(policy, buf, count);
848 else 798 up_write(&policy->rwsem);
849 ret = -EIO; 799 }
850 800
851 up_write(&policy->rwsem);
852unlock:
853 put_online_cpus(); 801 put_online_cpus();
854 802
855 return ret; 803 return ret;
@@ -959,6 +907,11 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
959 return cpufreq_add_dev_symlink(policy); 907 return cpufreq_add_dev_symlink(policy);
960} 908}
961 909
910__weak struct cpufreq_governor *cpufreq_default_governor(void)
911{
912 return NULL;
913}
914
962static int cpufreq_init_policy(struct cpufreq_policy *policy) 915static int cpufreq_init_policy(struct cpufreq_policy *policy)
963{ 916{
964 struct cpufreq_governor *gov = NULL; 917 struct cpufreq_governor *gov = NULL;
@@ -968,11 +921,14 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy)
968 921
969 /* Update governor of new_policy to the governor used before hotplug */ 922 /* Update governor of new_policy to the governor used before hotplug */
970 gov = find_governor(policy->last_governor); 923 gov = find_governor(policy->last_governor);
971 if (gov) 924 if (gov) {
972 pr_debug("Restoring governor %s for cpu %d\n", 925 pr_debug("Restoring governor %s for cpu %d\n",
973 policy->governor->name, policy->cpu); 926 policy->governor->name, policy->cpu);
974 else 927 } else {
975 gov = CPUFREQ_DEFAULT_GOVERNOR; 928 gov = cpufreq_default_governor();
929 if (!gov)
930 return -ENODATA;
931 }
976 932
977 new_policy.governor = gov; 933 new_policy.governor = gov;
978 934
@@ -996,36 +952,45 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
996 if (cpumask_test_cpu(cpu, policy->cpus)) 952 if (cpumask_test_cpu(cpu, policy->cpus))
997 return 0; 953 return 0;
998 954
955 down_write(&policy->rwsem);
999 if (has_target()) { 956 if (has_target()) {
1000 ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); 957 ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
1001 if (ret) { 958 if (ret) {
1002 pr_err("%s: Failed to stop governor\n", __func__); 959 pr_err("%s: Failed to stop governor\n", __func__);
1003 return ret; 960 goto unlock;
1004 } 961 }
1005 } 962 }
1006 963
1007 down_write(&policy->rwsem);
1008 cpumask_set_cpu(cpu, policy->cpus); 964 cpumask_set_cpu(cpu, policy->cpus);
1009 up_write(&policy->rwsem);
1010 965
1011 if (has_target()) { 966 if (has_target()) {
1012 ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); 967 ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
1013 if (!ret) 968 if (!ret)
1014 ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); 969 ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
1015 970
1016 if (ret) { 971 if (ret)
1017 pr_err("%s: Failed to start governor\n", __func__); 972 pr_err("%s: Failed to start governor\n", __func__);
1018 return ret;
1019 }
1020 } 973 }
1021 974
1022 return 0; 975unlock:
976 up_write(&policy->rwsem);
977 return ret;
978}
979
980static void handle_update(struct work_struct *work)
981{
982 struct cpufreq_policy *policy =
983 container_of(work, struct cpufreq_policy, update);
984 unsigned int cpu = policy->cpu;
985 pr_debug("handle_update for cpu %u called\n", cpu);
986 cpufreq_update_policy(cpu);
1023} 987}
1024 988
1025static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) 989static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
1026{ 990{
1027 struct device *dev = get_cpu_device(cpu); 991 struct device *dev = get_cpu_device(cpu);
1028 struct cpufreq_policy *policy; 992 struct cpufreq_policy *policy;
993 int ret;
1029 994
1030 if (WARN_ON(!dev)) 995 if (WARN_ON(!dev))
1031 return NULL; 996 return NULL;
@@ -1043,7 +1008,13 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
1043 if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL)) 1008 if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL))
1044 goto err_free_rcpumask; 1009 goto err_free_rcpumask;
1045 1010
1046 kobject_init(&policy->kobj, &ktype_cpufreq); 1011 ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
1012 cpufreq_global_kobject, "policy%u", cpu);
1013 if (ret) {
1014 pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
1015 goto err_free_real_cpus;
1016 }
1017
1047 INIT_LIST_HEAD(&policy->policy_list); 1018 INIT_LIST_HEAD(&policy->policy_list);
1048 init_rwsem(&policy->rwsem); 1019 init_rwsem(&policy->rwsem);
1049 spin_lock_init(&policy->transition_lock); 1020 spin_lock_init(&policy->transition_lock);
@@ -1054,6 +1025,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
1054 policy->cpu = cpu; 1025 policy->cpu = cpu;
1055 return policy; 1026 return policy;
1056 1027
1028err_free_real_cpus:
1029 free_cpumask_var(policy->real_cpus);
1057err_free_rcpumask: 1030err_free_rcpumask:
1058 free_cpumask_var(policy->related_cpus); 1031 free_cpumask_var(policy->related_cpus);
1059err_free_cpumask: 1032err_free_cpumask:
@@ -1158,16 +1131,6 @@ static int cpufreq_online(unsigned int cpu)
1158 cpumask_copy(policy->related_cpus, policy->cpus); 1131 cpumask_copy(policy->related_cpus, policy->cpus);
1159 /* Remember CPUs present at the policy creation time. */ 1132 /* Remember CPUs present at the policy creation time. */
1160 cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask); 1133 cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);
1161
1162 /* Name and add the kobject */
1163 ret = kobject_add(&policy->kobj, cpufreq_global_kobject,
1164 "policy%u",
1165 cpumask_first(policy->related_cpus));
1166 if (ret) {
1167 pr_err("%s: failed to add policy->kobj: %d\n", __func__,
1168 ret);
1169 goto out_exit_policy;
1170 }
1171 } 1134 }
1172 1135
1173 /* 1136 /*
@@ -1309,9 +1272,10 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
1309 return ret; 1272 return ret;
1310} 1273}
1311 1274
1312static void cpufreq_offline_prepare(unsigned int cpu) 1275static void cpufreq_offline(unsigned int cpu)
1313{ 1276{
1314 struct cpufreq_policy *policy; 1277 struct cpufreq_policy *policy;
1278 int ret;
1315 1279
1316 pr_debug("%s: unregistering CPU %u\n", __func__, cpu); 1280 pr_debug("%s: unregistering CPU %u\n", __func__, cpu);
1317 1281
@@ -1321,13 +1285,13 @@ static void cpufreq_offline_prepare(unsigned int cpu)
1321 return; 1285 return;
1322 } 1286 }
1323 1287
1288 down_write(&policy->rwsem);
1324 if (has_target()) { 1289 if (has_target()) {
1325 int ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); 1290 ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
1326 if (ret) 1291 if (ret)
1327 pr_err("%s: Failed to stop governor\n", __func__); 1292 pr_err("%s: Failed to stop governor\n", __func__);
1328 } 1293 }
1329 1294
1330 down_write(&policy->rwsem);
1331 cpumask_clear_cpu(cpu, policy->cpus); 1295 cpumask_clear_cpu(cpu, policy->cpus);
1332 1296
1333 if (policy_is_inactive(policy)) { 1297 if (policy_is_inactive(policy)) {
@@ -1340,39 +1304,27 @@ static void cpufreq_offline_prepare(unsigned int cpu)
1340 /* Nominate new CPU */ 1304 /* Nominate new CPU */
1341 policy->cpu = cpumask_any(policy->cpus); 1305 policy->cpu = cpumask_any(policy->cpus);
1342 } 1306 }
1343 up_write(&policy->rwsem);
1344 1307
1345 /* Start governor again for active policy */ 1308 /* Start governor again for active policy */
1346 if (!policy_is_inactive(policy)) { 1309 if (!policy_is_inactive(policy)) {
1347 if (has_target()) { 1310 if (has_target()) {
1348 int ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); 1311 ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
1349 if (!ret) 1312 if (!ret)
1350 ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); 1313 ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
1351 1314
1352 if (ret) 1315 if (ret)
1353 pr_err("%s: Failed to start governor\n", __func__); 1316 pr_err("%s: Failed to start governor\n", __func__);
1354 } 1317 }
1355 } else if (cpufreq_driver->stop_cpu) {
1356 cpufreq_driver->stop_cpu(policy);
1357 }
1358}
1359 1318
1360static void cpufreq_offline_finish(unsigned int cpu) 1319 goto unlock;
1361{
1362 struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
1363
1364 if (!policy) {
1365 pr_debug("%s: No cpu_data found\n", __func__);
1366 return;
1367 } 1320 }
1368 1321
1369 /* Only proceed for inactive policies */ 1322 if (cpufreq_driver->stop_cpu)
1370 if (!policy_is_inactive(policy)) 1323 cpufreq_driver->stop_cpu(policy);
1371 return;
1372 1324
1373 /* If cpu is last user of policy, free policy */ 1325 /* If cpu is last user of policy, free policy */
1374 if (has_target()) { 1326 if (has_target()) {
1375 int ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); 1327 ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
1376 if (ret) 1328 if (ret)
1377 pr_err("%s: Failed to exit governor\n", __func__); 1329 pr_err("%s: Failed to exit governor\n", __func__);
1378 } 1330 }
@@ -1386,6 +1338,9 @@ static void cpufreq_offline_finish(unsigned int cpu)
1386 cpufreq_driver->exit(policy); 1338 cpufreq_driver->exit(policy);
1387 policy->freq_table = NULL; 1339 policy->freq_table = NULL;
1388 } 1340 }
1341
1342unlock:
1343 up_write(&policy->rwsem);
1389} 1344}
1390 1345
1391/** 1346/**
@@ -1401,10 +1356,8 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
1401 if (!policy) 1356 if (!policy)
1402 return; 1357 return;
1403 1358
1404 if (cpu_online(cpu)) { 1359 if (cpu_online(cpu))
1405 cpufreq_offline_prepare(cpu); 1360 cpufreq_offline(cpu);
1406 cpufreq_offline_finish(cpu);
1407 }
1408 1361
1409 cpumask_clear_cpu(cpu, policy->real_cpus); 1362 cpumask_clear_cpu(cpu, policy->real_cpus);
1410 remove_cpu_dev_symlink(policy, cpu); 1363 remove_cpu_dev_symlink(policy, cpu);
@@ -1413,15 +1366,6 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
1413 cpufreq_policy_free(policy, true); 1366 cpufreq_policy_free(policy, true);
1414} 1367}
1415 1368
1416static void handle_update(struct work_struct *work)
1417{
1418 struct cpufreq_policy *policy =
1419 container_of(work, struct cpufreq_policy, update);
1420 unsigned int cpu = policy->cpu;
1421 pr_debug("handle_update for cpu %u called\n", cpu);
1422 cpufreq_update_policy(cpu);
1423}
1424
1425/** 1369/**
1426 * cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're 1370 * cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're
1427 * in deep trouble. 1371 * in deep trouble.
@@ -1584,6 +1528,7 @@ EXPORT_SYMBOL(cpufreq_generic_suspend);
1584void cpufreq_suspend(void) 1528void cpufreq_suspend(void)
1585{ 1529{
1586 struct cpufreq_policy *policy; 1530 struct cpufreq_policy *policy;
1531 int ret;
1587 1532
1588 if (!cpufreq_driver) 1533 if (!cpufreq_driver)
1589 return; 1534 return;
@@ -1594,7 +1539,11 @@ void cpufreq_suspend(void)
1594 pr_debug("%s: Suspending Governors\n", __func__); 1539 pr_debug("%s: Suspending Governors\n", __func__);
1595 1540
1596 for_each_active_policy(policy) { 1541 for_each_active_policy(policy) {
1597 if (__cpufreq_governor(policy, CPUFREQ_GOV_STOP)) 1542 down_write(&policy->rwsem);
1543 ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
1544 up_write(&policy->rwsem);
1545
1546 if (ret)
1598 pr_err("%s: Failed to stop governor for policy: %p\n", 1547 pr_err("%s: Failed to stop governor for policy: %p\n",
1599 __func__, policy); 1548 __func__, policy);
1600 else if (cpufreq_driver->suspend 1549 else if (cpufreq_driver->suspend
@@ -1616,6 +1565,7 @@ suspend:
1616void cpufreq_resume(void) 1565void cpufreq_resume(void)
1617{ 1566{
1618 struct cpufreq_policy *policy; 1567 struct cpufreq_policy *policy;
1568 int ret;
1619 1569
1620 if (!cpufreq_driver) 1570 if (!cpufreq_driver)
1621 return; 1571 return;
@@ -1628,13 +1578,20 @@ void cpufreq_resume(void)
1628 pr_debug("%s: Resuming Governors\n", __func__); 1578 pr_debug("%s: Resuming Governors\n", __func__);
1629 1579
1630 for_each_active_policy(policy) { 1580 for_each_active_policy(policy) {
1631 if (cpufreq_driver->resume && cpufreq_driver->resume(policy)) 1581 if (cpufreq_driver->resume && cpufreq_driver->resume(policy)) {
1632 pr_err("%s: Failed to resume driver: %p\n", __func__, 1582 pr_err("%s: Failed to resume driver: %p\n", __func__,
1633 policy); 1583 policy);
1634 else if (__cpufreq_governor(policy, CPUFREQ_GOV_START) 1584 } else {
1635 || __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS)) 1585 down_write(&policy->rwsem);
1636 pr_err("%s: Failed to start governor for policy: %p\n", 1586 ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
1637 __func__, policy); 1587 if (!ret)
1588 cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
1589 up_write(&policy->rwsem);
1590
1591 if (ret)
1592 pr_err("%s: Failed to start governor for policy: %p\n",
1593 __func__, policy);
1594 }
1638 } 1595 }
1639 1596
1640 /* 1597 /*
@@ -1846,7 +1803,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
1846 unsigned int relation) 1803 unsigned int relation)
1847{ 1804{
1848 unsigned int old_target_freq = target_freq; 1805 unsigned int old_target_freq = target_freq;
1849 int retval = -EINVAL; 1806 struct cpufreq_frequency_table *freq_table;
1807 int index, retval;
1850 1808
1851 if (cpufreq_disabled()) 1809 if (cpufreq_disabled())
1852 return -ENODEV; 1810 return -ENODEV;
@@ -1873,34 +1831,28 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
1873 policy->restore_freq = policy->cur; 1831 policy->restore_freq = policy->cur;
1874 1832
1875 if (cpufreq_driver->target) 1833 if (cpufreq_driver->target)
1876 retval = cpufreq_driver->target(policy, target_freq, relation); 1834 return cpufreq_driver->target(policy, target_freq, relation);
1877 else if (cpufreq_driver->target_index) {
1878 struct cpufreq_frequency_table *freq_table;
1879 int index;
1880
1881 freq_table = cpufreq_frequency_get_table(policy->cpu);
1882 if (unlikely(!freq_table)) {
1883 pr_err("%s: Unable to find freq_table\n", __func__);
1884 goto out;
1885 }
1886 1835
1887 retval = cpufreq_frequency_table_target(policy, freq_table, 1836 if (!cpufreq_driver->target_index)
1888 target_freq, relation, &index); 1837 return -EINVAL;
1889 if (unlikely(retval)) {
1890 pr_err("%s: Unable to find matching freq\n", __func__);
1891 goto out;
1892 }
1893 1838
1894 if (freq_table[index].frequency == policy->cur) { 1839 freq_table = cpufreq_frequency_get_table(policy->cpu);
1895 retval = 0; 1840 if (unlikely(!freq_table)) {
1896 goto out; 1841 pr_err("%s: Unable to find freq_table\n", __func__);
1897 } 1842 return -EINVAL;
1843 }
1898 1844
1899 retval = __target_index(policy, freq_table, index); 1845 retval = cpufreq_frequency_table_target(policy, freq_table, target_freq,
1846 relation, &index);
1847 if (unlikely(retval)) {
1848 pr_err("%s: Unable to find matching freq\n", __func__);
1849 return retval;
1900 } 1850 }
1901 1851
1902out: 1852 if (freq_table[index].frequency == policy->cur)
1903 return retval; 1853 return 0;
1854
1855 return __target_index(policy, freq_table, index);
1904} 1856}
1905EXPORT_SYMBOL_GPL(__cpufreq_driver_target); 1857EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
1906 1858
@@ -1920,20 +1872,14 @@ int cpufreq_driver_target(struct cpufreq_policy *policy,
1920} 1872}
1921EXPORT_SYMBOL_GPL(cpufreq_driver_target); 1873EXPORT_SYMBOL_GPL(cpufreq_driver_target);
1922 1874
1923static int __cpufreq_governor(struct cpufreq_policy *policy, 1875__weak struct cpufreq_governor *cpufreq_fallback_governor(void)
1924 unsigned int event)
1925{ 1876{
1926 int ret; 1877 return NULL;
1878}
1927 1879
1928 /* Only must be defined when default governor is known to have latency 1880static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
1929 restrictions, like e.g. conservative or ondemand. 1881{
1930 That this is the case is already ensured in Kconfig 1882 int ret;
1931 */
1932#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
1933 struct cpufreq_governor *gov = &cpufreq_gov_performance;
1934#else
1935 struct cpufreq_governor *gov = NULL;
1936#endif
1937 1883
1938 /* Don't start any governor operations if we are entering suspend */ 1884 /* Don't start any governor operations if we are entering suspend */
1939 if (cpufreq_suspended) 1885 if (cpufreq_suspended)
@@ -1948,12 +1894,14 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
1948 if (policy->governor->max_transition_latency && 1894 if (policy->governor->max_transition_latency &&
1949 policy->cpuinfo.transition_latency > 1895 policy->cpuinfo.transition_latency >
1950 policy->governor->max_transition_latency) { 1896 policy->governor->max_transition_latency) {
1951 if (!gov) 1897 struct cpufreq_governor *gov = cpufreq_fallback_governor();
1952 return -EINVAL; 1898
1953 else { 1899 if (gov) {
1954 pr_warn("%s governor failed, too long transition latency of HW, fallback to %s governor\n", 1900 pr_warn("%s governor failed, too long transition latency of HW, fallback to %s governor\n",
1955 policy->governor->name, gov->name); 1901 policy->governor->name, gov->name);
1956 policy->governor = gov; 1902 policy->governor = gov;
1903 } else {
1904 return -EINVAL;
1957 } 1905 }
1958 } 1906 }
1959 1907
@@ -1963,21 +1911,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
1963 1911
1964 pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event); 1912 pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event);
1965 1913
1966 mutex_lock(&cpufreq_governor_lock);
1967 if ((policy->governor_enabled && event == CPUFREQ_GOV_START)
1968 || (!policy->governor_enabled
1969 && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) {
1970 mutex_unlock(&cpufreq_governor_lock);
1971 return -EBUSY;
1972 }
1973
1974 if (event == CPUFREQ_GOV_STOP)
1975 policy->governor_enabled = false;
1976 else if (event == CPUFREQ_GOV_START)
1977 policy->governor_enabled = true;
1978
1979 mutex_unlock(&cpufreq_governor_lock);
1980
1981 ret = policy->governor->governor(policy, event); 1914 ret = policy->governor->governor(policy, event);
1982 1915
1983 if (!ret) { 1916 if (!ret) {
@@ -1985,14 +1918,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
1985 policy->governor->initialized++; 1918 policy->governor->initialized++;
1986 else if (event == CPUFREQ_GOV_POLICY_EXIT) 1919 else if (event == CPUFREQ_GOV_POLICY_EXIT)
1987 policy->governor->initialized--; 1920 policy->governor->initialized--;
1988 } else {
1989 /* Restore original values */
1990 mutex_lock(&cpufreq_governor_lock);
1991 if (event == CPUFREQ_GOV_STOP)
1992 policy->governor_enabled = true;
1993 else if (event == CPUFREQ_GOV_START)
1994 policy->governor_enabled = false;
1995 mutex_unlock(&cpufreq_governor_lock);
1996 } 1921 }
1997 1922
1998 if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) || 1923 if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) ||
@@ -2147,7 +2072,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
2147 old_gov = policy->governor; 2072 old_gov = policy->governor;
2148 /* end old governor */ 2073 /* end old governor */
2149 if (old_gov) { 2074 if (old_gov) {
2150 ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); 2075 ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
2151 if (ret) { 2076 if (ret) {
2152 /* This can happen due to race with other operations */ 2077 /* This can happen due to race with other operations */
2153 pr_debug("%s: Failed to Stop Governor: %s (%d)\n", 2078 pr_debug("%s: Failed to Stop Governor: %s (%d)\n",
@@ -2155,10 +2080,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
2155 return ret; 2080 return ret;
2156 } 2081 }
2157 2082
2158 up_write(&policy->rwsem); 2083 ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
2159 ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
2160 down_write(&policy->rwsem);
2161
2162 if (ret) { 2084 if (ret) {
2163 pr_err("%s: Failed to Exit Governor: %s (%d)\n", 2085 pr_err("%s: Failed to Exit Governor: %s (%d)\n",
2164 __func__, old_gov->name, ret); 2086 __func__, old_gov->name, ret);
@@ -2168,32 +2090,30 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
2168 2090
2169 /* start new governor */ 2091 /* start new governor */
2170 policy->governor = new_policy->governor; 2092 policy->governor = new_policy->governor;
2171 ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); 2093 ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
2172 if (!ret) { 2094 if (!ret) {
2173 ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); 2095 ret = cpufreq_governor(policy, CPUFREQ_GOV_START);
2174 if (!ret) 2096 if (!ret)
2175 goto out; 2097 goto out;
2176 2098
2177 up_write(&policy->rwsem); 2099 cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
2178 __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
2179 down_write(&policy->rwsem);
2180 } 2100 }
2181 2101
2182 /* new governor failed, so re-start old one */ 2102 /* new governor failed, so re-start old one */
2183 pr_debug("starting governor %s failed\n", policy->governor->name); 2103 pr_debug("starting governor %s failed\n", policy->governor->name);
2184 if (old_gov) { 2104 if (old_gov) {
2185 policy->governor = old_gov; 2105 policy->governor = old_gov;
2186 if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) 2106 if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
2187 policy->governor = NULL; 2107 policy->governor = NULL;
2188 else 2108 else
2189 __cpufreq_governor(policy, CPUFREQ_GOV_START); 2109 cpufreq_governor(policy, CPUFREQ_GOV_START);
2190 } 2110 }
2191 2111
2192 return ret; 2112 return ret;
2193 2113
2194 out: 2114 out:
2195 pr_debug("governor: change or update limits\n"); 2115 pr_debug("governor: change or update limits\n");
2196 return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); 2116 return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
2197} 2117}
2198 2118
2199/** 2119/**
@@ -2260,11 +2180,7 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
2260 break; 2180 break;
2261 2181
2262 case CPU_DOWN_PREPARE: 2182 case CPU_DOWN_PREPARE:
2263 cpufreq_offline_prepare(cpu); 2183 cpufreq_offline(cpu);
2264 break;
2265
2266 case CPU_POST_DEAD:
2267 cpufreq_offline_finish(cpu);
2268 break; 2184 break;
2269 2185
2270 case CPU_DOWN_FAILED: 2186 case CPU_DOWN_FAILED:
@@ -2297,8 +2213,11 @@ static int cpufreq_boost_set_sw(int state)
2297 __func__); 2213 __func__);
2298 break; 2214 break;
2299 } 2215 }
2216
2217 down_write(&policy->rwsem);
2300 policy->user_policy.max = policy->max; 2218 policy->user_policy.max = policy->max;
2301 __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); 2219 cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
2220 up_write(&policy->rwsem);
2302 } 2221 }
2303 } 2222 }
2304 2223
@@ -2384,7 +2303,7 @@ EXPORT_SYMBOL_GPL(cpufreq_boost_enabled);
2384 * submitted by the CPU Frequency driver. 2303 * submitted by the CPU Frequency driver.
2385 * 2304 *
2386 * Registers a CPU Frequency driver to this core code. This code 2305 * Registers a CPU Frequency driver to this core code. This code
2387 * returns zero on success, -EBUSY when another driver got here first 2306 * returns zero on success, -EEXIST when another driver got here first
2388 * (and isn't unregistered in the meantime). 2307 * (and isn't unregistered in the meantime).
2389 * 2308 *
2390 */ 2309 */
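
The hunks above drop the up_write()/down_write() dance around the governor callbacks, so the whole governor-switch path in cpufreq_set_policy() now runs with policy->rwsem held and simply calls cpufreq_governor() for each event. Below is a minimal standalone sketch of that event ordering; the helper names (gov_call, switch_governor_sketch) and the trivial error handling are assumptions for illustration, not the kernel's cpufreq API, and the fallback that restarts the old governor on failure is reduced to a comment.

#include <stdio.h>

/* Hypothetical event codes mirroring CPUFREQ_GOV_* for illustration only. */
enum gov_event { GOV_STOP, GOV_POLICY_EXIT, GOV_POLICY_INIT, GOV_START, GOV_LIMITS };

/* Stand-in for cpufreq_governor(); always succeeds in this sketch. */
static int gov_call(const char *who, enum gov_event ev)
{
	static const char *names[] = { "STOP", "POLICY_EXIT", "POLICY_INIT", "START", "LIMITS" };

	printf("%s governor: %s\n", who, names[ev]);
	return 0;
}

/* Assumed ordering when replacing the old governor with a new one. */
static int switch_governor_sketch(void)
{
	int ret;

	/* Tear the old governor down completely first. */
	if (gov_call("old", GOV_STOP) || gov_call("old", GOV_POLICY_EXIT))
		return -1;

	/* Bring the new one up; on failure the old one would be re-inited (omitted). */
	ret = gov_call("new", GOV_POLICY_INIT);
	if (!ret)
		ret = gov_call("new", GOV_START);
	if (!ret)
		ret = gov_call("new", GOV_LIMITS);
	return ret;
}

int main(void)
{
	return switch_governor_sketch();
}

The point of the ordering is that the old governor is fully stopped and exited before the new one is initialized, and the final LIMITS event lets the new governor clamp the current frequency to the policy limits.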
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 606ad74abe6e..bf4913f6453b 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -14,6 +14,22 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include "cpufreq_governor.h" 15#include "cpufreq_governor.h"
16 16
17struct cs_policy_dbs_info {
18 struct policy_dbs_info policy_dbs;
19 unsigned int down_skip;
20 unsigned int requested_freq;
21};
22
23static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
24{
25 return container_of(policy_dbs, struct cs_policy_dbs_info, policy_dbs);
26}
27
28struct cs_dbs_tuners {
29 unsigned int down_threshold;
30 unsigned int freq_step;
31};
32
17/* Conservative governor macros */ 33/* Conservative governor macros */
18#define DEF_FREQUENCY_UP_THRESHOLD (80) 34#define DEF_FREQUENCY_UP_THRESHOLD (80)
19#define DEF_FREQUENCY_DOWN_THRESHOLD (20) 35#define DEF_FREQUENCY_DOWN_THRESHOLD (20)
@@ -21,21 +37,6 @@
21#define DEF_SAMPLING_DOWN_FACTOR (1) 37#define DEF_SAMPLING_DOWN_FACTOR (1)
22#define MAX_SAMPLING_DOWN_FACTOR (10) 38#define MAX_SAMPLING_DOWN_FACTOR (10)
23 39
24static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info);
25
26static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy,
27 unsigned int event);
28
29#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
30static
31#endif
32struct cpufreq_governor cpufreq_gov_conservative = {
33 .name = "conservative",
34 .governor = cs_cpufreq_governor_dbs,
35 .max_transition_latency = TRANSITION_LATENCY_LIMIT,
36 .owner = THIS_MODULE,
37};
38
39static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, 40static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
40 struct cpufreq_policy *policy) 41 struct cpufreq_policy *policy)
41{ 42{
@@ -57,27 +58,28 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
57 * Any frequency increase takes it to the maximum frequency. Frequency reduction 58 * Any frequency increase takes it to the maximum frequency. Frequency reduction
58 * happens at minimum steps of 5% (default) of maximum frequency 59 * happens at minimum steps of 5% (default) of maximum frequency
59 */ 60 */
60static void cs_check_cpu(int cpu, unsigned int load) 61static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
61{ 62{
62 struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); 63 struct policy_dbs_info *policy_dbs = policy->governor_data;
63 struct cpufreq_policy *policy = dbs_info->cdbs.shared->policy; 64 struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
64 struct dbs_data *dbs_data = policy->governor_data; 65 struct dbs_data *dbs_data = policy_dbs->dbs_data;
65 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 66 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
67 unsigned int load = dbs_update(policy);
66 68
67 /* 69 /*
68 * break out if we 'cannot' reduce the speed as the user might 70 * break out if we 'cannot' reduce the speed as the user might
69 * want freq_step to be zero 71 * want freq_step to be zero
70 */ 72 */
71 if (cs_tuners->freq_step == 0) 73 if (cs_tuners->freq_step == 0)
72 return; 74 goto out;
73 75
74 /* Check for frequency increase */ 76 /* Check for frequency increase */
75 if (load > cs_tuners->up_threshold) { 77 if (load > dbs_data->up_threshold) {
76 dbs_info->down_skip = 0; 78 dbs_info->down_skip = 0;
77 79
78 /* if we are already at full speed then break out early */ 80 /* if we are already at full speed then break out early */
79 if (dbs_info->requested_freq == policy->max) 81 if (dbs_info->requested_freq == policy->max)
80 return; 82 goto out;
81 83
82 dbs_info->requested_freq += get_freq_target(cs_tuners, policy); 84 dbs_info->requested_freq += get_freq_target(cs_tuners, policy);
83 85
@@ -86,12 +88,12 @@ static void cs_check_cpu(int cpu, unsigned int load)
86 88
87 __cpufreq_driver_target(policy, dbs_info->requested_freq, 89 __cpufreq_driver_target(policy, dbs_info->requested_freq,
88 CPUFREQ_RELATION_H); 90 CPUFREQ_RELATION_H);
89 return; 91 goto out;
90 } 92 }
91 93
92 /* if sampling_down_factor is active break out early */ 94 /* if sampling_down_factor is active break out early */
93 if (++dbs_info->down_skip < cs_tuners->sampling_down_factor) 95 if (++dbs_info->down_skip < dbs_data->sampling_down_factor)
94 return; 96 goto out;
95 dbs_info->down_skip = 0; 97 dbs_info->down_skip = 0;
96 98
97 /* Check for frequency decrease */ 99 /* Check for frequency decrease */
@@ -101,7 +103,7 @@ static void cs_check_cpu(int cpu, unsigned int load)
101 * if we cannot reduce the frequency anymore, break out early 103 * if we cannot reduce the frequency anymore, break out early
102 */ 104 */
103 if (policy->cur == policy->min) 105 if (policy->cur == policy->min)
104 return; 106 goto out;
105 107
106 freq_target = get_freq_target(cs_tuners, policy); 108 freq_target = get_freq_target(cs_tuners, policy);
107 if (dbs_info->requested_freq > freq_target) 109 if (dbs_info->requested_freq > freq_target)
@@ -111,58 +113,25 @@ static void cs_check_cpu(int cpu, unsigned int load)
111 113
112 __cpufreq_driver_target(policy, dbs_info->requested_freq, 114 __cpufreq_driver_target(policy, dbs_info->requested_freq,
113 CPUFREQ_RELATION_L); 115 CPUFREQ_RELATION_L);
114 return;
115 } 116 }
116}
117
118static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
119{
120 struct dbs_data *dbs_data = policy->governor_data;
121 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
122
123 if (modify_all)
124 dbs_check_cpu(dbs_data, policy->cpu);
125 117
126 return delay_for_sampling_rate(cs_tuners->sampling_rate); 118 out:
119 return dbs_data->sampling_rate;
127} 120}
128 121
129static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 122static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
130 void *data) 123 void *data);
131{
132 struct cpufreq_freqs *freq = data;
133 struct cs_cpu_dbs_info_s *dbs_info =
134 &per_cpu(cs_cpu_dbs_info, freq->cpu);
135 struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu);
136
137 if (!policy)
138 return 0;
139
140 /* policy isn't governed by conservative governor */
141 if (policy->governor != &cpufreq_gov_conservative)
142 return 0;
143
144 /*
145 * we only care if our internally tracked freq moves outside the 'valid'
146 * ranges of frequency available to us otherwise we do not change it
147 */
148 if (dbs_info->requested_freq > policy->max
149 || dbs_info->requested_freq < policy->min)
150 dbs_info->requested_freq = freq->new;
151
152 return 0;
153}
154 124
155static struct notifier_block cs_cpufreq_notifier_block = { 125static struct notifier_block cs_cpufreq_notifier_block = {
156 .notifier_call = dbs_cpufreq_notifier, 126 .notifier_call = dbs_cpufreq_notifier,
157}; 127};
158 128
159/************************** sysfs interface ************************/ 129/************************** sysfs interface ************************/
160static struct common_dbs_data cs_dbs_cdata; 130static struct dbs_governor cs_dbs_gov;
161 131
162static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, 132static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
163 const char *buf, size_t count) 133 const char *buf, size_t count)
164{ 134{
165 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
166 unsigned int input; 135 unsigned int input;
167 int ret; 136 int ret;
168 ret = sscanf(buf, "%u", &input); 137 ret = sscanf(buf, "%u", &input);
@@ -170,22 +139,7 @@ static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
170 if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) 139 if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
171 return -EINVAL; 140 return -EINVAL;
172 141
173 cs_tuners->sampling_down_factor = input; 142 dbs_data->sampling_down_factor = input;
174 return count;
175}
176
177static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
178 size_t count)
179{
180 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
181 unsigned int input;
182 int ret;
183 ret = sscanf(buf, "%u", &input);
184
185 if (ret != 1)
186 return -EINVAL;
187
188 cs_tuners->sampling_rate = max(input, dbs_data->min_sampling_rate);
189 return count; 143 return count;
190} 144}
191 145
@@ -200,7 +154,7 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
200 if (ret != 1 || input > 100 || input <= cs_tuners->down_threshold) 154 if (ret != 1 || input > 100 || input <= cs_tuners->down_threshold)
201 return -EINVAL; 155 return -EINVAL;
202 156
203 cs_tuners->up_threshold = input; 157 dbs_data->up_threshold = input;
204 return count; 158 return count;
205} 159}
206 160
@@ -214,7 +168,7 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf,
214 168
215 /* cannot be lower than 11 otherwise freq will not fall */ 169 /* cannot be lower than 11 otherwise freq will not fall */
216 if (ret != 1 || input < 11 || input > 100 || 170 if (ret != 1 || input < 11 || input > 100 ||
217 input >= cs_tuners->up_threshold) 171 input >= dbs_data->up_threshold)
218 return -EINVAL; 172 return -EINVAL;
219 173
220 cs_tuners->down_threshold = input; 174 cs_tuners->down_threshold = input;
@@ -224,8 +178,7 @@ static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf,
224static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, 178static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
225 const char *buf, size_t count) 179 const char *buf, size_t count)
226{ 180{
227 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 181 unsigned int input;
228 unsigned int input, j;
229 int ret; 182 int ret;
230 183
231 ret = sscanf(buf, "%u", &input); 184 ret = sscanf(buf, "%u", &input);
@@ -235,21 +188,14 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
235 if (input > 1) 188 if (input > 1)
236 input = 1; 189 input = 1;
237 190
238 if (input == cs_tuners->ignore_nice_load) /* nothing to do */ 191 if (input == dbs_data->ignore_nice_load) /* nothing to do */
239 return count; 192 return count;
240 193
241 cs_tuners->ignore_nice_load = input; 194 dbs_data->ignore_nice_load = input;
242 195
243 /* we need to re-evaluate prev_cpu_idle */ 196 /* we need to re-evaluate prev_cpu_idle */
244 for_each_online_cpu(j) { 197 gov_update_cpu_data(dbs_data);
245 struct cs_cpu_dbs_info_s *dbs_info; 198
246 dbs_info = &per_cpu(cs_cpu_dbs_info, j);
247 dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
248 &dbs_info->cdbs.prev_cpu_wall, 0);
249 if (cs_tuners->ignore_nice_load)
250 dbs_info->cdbs.prev_cpu_nice =
251 kcpustat_cpu(j).cpustat[CPUTIME_NICE];
252 }
253 return count; 199 return count;
254} 200}
255 201
@@ -275,55 +221,47 @@ static ssize_t store_freq_step(struct dbs_data *dbs_data, const char *buf,
275 return count; 221 return count;
276} 222}
277 223
278show_store_one(cs, sampling_rate); 224gov_show_one_common(sampling_rate);
279show_store_one(cs, sampling_down_factor); 225gov_show_one_common(sampling_down_factor);
280show_store_one(cs, up_threshold); 226gov_show_one_common(up_threshold);
281show_store_one(cs, down_threshold); 227gov_show_one_common(ignore_nice_load);
282show_store_one(cs, ignore_nice_load); 228gov_show_one_common(min_sampling_rate);
283show_store_one(cs, freq_step); 229gov_show_one(cs, down_threshold);
284declare_show_sampling_rate_min(cs); 230gov_show_one(cs, freq_step);
285 231
286gov_sys_pol_attr_rw(sampling_rate); 232gov_attr_rw(sampling_rate);
287gov_sys_pol_attr_rw(sampling_down_factor); 233gov_attr_rw(sampling_down_factor);
288gov_sys_pol_attr_rw(up_threshold); 234gov_attr_rw(up_threshold);
289gov_sys_pol_attr_rw(down_threshold); 235gov_attr_rw(ignore_nice_load);
290gov_sys_pol_attr_rw(ignore_nice_load); 236gov_attr_ro(min_sampling_rate);
291gov_sys_pol_attr_rw(freq_step); 237gov_attr_rw(down_threshold);
292gov_sys_pol_attr_ro(sampling_rate_min); 238gov_attr_rw(freq_step);
293 239
294static struct attribute *dbs_attributes_gov_sys[] = { 240static struct attribute *cs_attributes[] = {
295 &sampling_rate_min_gov_sys.attr, 241 &min_sampling_rate.attr,
296 &sampling_rate_gov_sys.attr, 242 &sampling_rate.attr,
297 &sampling_down_factor_gov_sys.attr, 243 &sampling_down_factor.attr,
298 &up_threshold_gov_sys.attr, 244 &up_threshold.attr,
299 &down_threshold_gov_sys.attr, 245 &down_threshold.attr,
300 &ignore_nice_load_gov_sys.attr, 246 &ignore_nice_load.attr,
301 &freq_step_gov_sys.attr, 247 &freq_step.attr,
302 NULL 248 NULL
303}; 249};
304 250
305static struct attribute_group cs_attr_group_gov_sys = { 251/************************** sysfs end ************************/
306 .attrs = dbs_attributes_gov_sys,
307 .name = "conservative",
308};
309 252
310static struct attribute *dbs_attributes_gov_pol[] = { 253static struct policy_dbs_info *cs_alloc(void)
311 &sampling_rate_min_gov_pol.attr, 254{
312 &sampling_rate_gov_pol.attr, 255 struct cs_policy_dbs_info *dbs_info;
313 &sampling_down_factor_gov_pol.attr,
314 &up_threshold_gov_pol.attr,
315 &down_threshold_gov_pol.attr,
316 &ignore_nice_load_gov_pol.attr,
317 &freq_step_gov_pol.attr,
318 NULL
319};
320 256
321static struct attribute_group cs_attr_group_gov_pol = { 257 dbs_info = kzalloc(sizeof(*dbs_info), GFP_KERNEL);
322 .attrs = dbs_attributes_gov_pol, 258 return dbs_info ? &dbs_info->policy_dbs : NULL;
323 .name = "conservative", 259}
324};
325 260
326/************************** sysfs end ************************/ 261static void cs_free(struct policy_dbs_info *policy_dbs)
262{
263 kfree(to_dbs_info(policy_dbs));
264}
327 265
328static int cs_init(struct dbs_data *dbs_data, bool notify) 266static int cs_init(struct dbs_data *dbs_data, bool notify)
329{ 267{
@@ -335,11 +273,11 @@ static int cs_init(struct dbs_data *dbs_data, bool notify)
335 return -ENOMEM; 273 return -ENOMEM;
336 } 274 }
337 275
338 tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
339 tuners->down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD; 276 tuners->down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD;
340 tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
341 tuners->ignore_nice_load = 0;
342 tuners->freq_step = DEF_FREQUENCY_STEP; 277 tuners->freq_step = DEF_FREQUENCY_STEP;
278 dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
279 dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
280 dbs_data->ignore_nice_load = 0;
343 281
344 dbs_data->tuners = tuners; 282 dbs_data->tuners = tuners;
345 dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * 283 dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
@@ -361,35 +299,66 @@ static void cs_exit(struct dbs_data *dbs_data, bool notify)
361 kfree(dbs_data->tuners); 299 kfree(dbs_data->tuners);
362} 300}
363 301
364define_get_cpu_dbs_routines(cs_cpu_dbs_info); 302static void cs_start(struct cpufreq_policy *policy)
303{
304 struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
305
306 dbs_info->down_skip = 0;
307 dbs_info->requested_freq = policy->cur;
308}
365 309
366static struct common_dbs_data cs_dbs_cdata = { 310static struct dbs_governor cs_dbs_gov = {
367 .governor = GOV_CONSERVATIVE, 311 .gov = {
368 .attr_group_gov_sys = &cs_attr_group_gov_sys, 312 .name = "conservative",
369 .attr_group_gov_pol = &cs_attr_group_gov_pol, 313 .governor = cpufreq_governor_dbs,
370 .get_cpu_cdbs = get_cpu_cdbs, 314 .max_transition_latency = TRANSITION_LATENCY_LIMIT,
371 .get_cpu_dbs_info_s = get_cpu_dbs_info_s, 315 .owner = THIS_MODULE,
316 },
317 .kobj_type = { .default_attrs = cs_attributes },
372 .gov_dbs_timer = cs_dbs_timer, 318 .gov_dbs_timer = cs_dbs_timer,
373 .gov_check_cpu = cs_check_cpu, 319 .alloc = cs_alloc,
320 .free = cs_free,
374 .init = cs_init, 321 .init = cs_init,
375 .exit = cs_exit, 322 .exit = cs_exit,
376 .mutex = __MUTEX_INITIALIZER(cs_dbs_cdata.mutex), 323 .start = cs_start,
377}; 324};
378 325
379static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, 326#define CPU_FREQ_GOV_CONSERVATIVE (&cs_dbs_gov.gov)
380 unsigned int event) 327
328static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
329 void *data)
381{ 330{
382 return cpufreq_governor_dbs(policy, &cs_dbs_cdata, event); 331 struct cpufreq_freqs *freq = data;
332 struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu);
333 struct cs_policy_dbs_info *dbs_info;
334
335 if (!policy)
336 return 0;
337
338 /* policy isn't governed by conservative governor */
339 if (policy->governor != CPU_FREQ_GOV_CONSERVATIVE)
340 return 0;
341
342 dbs_info = to_dbs_info(policy->governor_data);
343 /*
344 * we only care if our internally tracked freq moves outside the 'valid'
345 * ranges of frequency available to us otherwise we do not change it
346 */
347 if (dbs_info->requested_freq > policy->max
348 || dbs_info->requested_freq < policy->min)
349 dbs_info->requested_freq = freq->new;
350
351 return 0;
383} 352}
384 353
385static int __init cpufreq_gov_dbs_init(void) 354static int __init cpufreq_gov_dbs_init(void)
386{ 355{
387 return cpufreq_register_governor(&cpufreq_gov_conservative); 356 return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE);
388} 357}
389 358
390static void __exit cpufreq_gov_dbs_exit(void) 359static void __exit cpufreq_gov_dbs_exit(void)
391{ 360{
392 cpufreq_unregister_governor(&cpufreq_gov_conservative); 361 cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE);
393} 362}
394 363
395MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>"); 364MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>");
@@ -399,6 +368,11 @@ MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for "
399MODULE_LICENSE("GPL"); 368MODULE_LICENSE("GPL");
400 369
401#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE 370#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
371struct cpufreq_governor *cpufreq_default_governor(void)
372{
373 return CPU_FREQ_GOV_CONSERVATIVE;
374}
375
402fs_initcall(cpufreq_gov_dbs_init); 376fs_initcall(cpufreq_gov_dbs_init);
403#else 377#else
404module_init(cpufreq_gov_dbs_init); 378module_init(cpufreq_gov_dbs_init);
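
The conservative governor now keeps its per-policy state (down_skip, requested_freq) in a cs_policy_dbs_info that embeds the common policy_dbs_info, and to_dbs_info() recovers the outer structure with container_of(); the alloc()/free() callbacks hand the embedded member back to the governor core. A small userspace sketch of that embedding pattern follows, under the assumption of simplified structures (the struct bodies below are placeholders, not the real definitions):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace rendition of the kernel's container_of() for illustration. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified stand-ins for the structures touched by the diff. */
struct policy_dbs_info { int dummy; };

struct cs_policy_dbs_info {
	struct policy_dbs_info policy_dbs;	/* embedded, not a pointer */
	unsigned int down_skip;
	unsigned int requested_freq;
};

static struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
{
	return container_of(policy_dbs, struct cs_policy_dbs_info, policy_dbs);
}

int main(void)
{
	/* The governor core only ever sees the embedded policy_dbs pointer... */
	struct cs_policy_dbs_info *cs = calloc(1, sizeof(*cs));
	struct policy_dbs_info *common = &cs->policy_dbs;

	/* ...and the conservative governor recovers its private data from it. */
	to_dbs_info(common)->requested_freq = 1200000;
	printf("requested_freq = %u\n", cs->requested_freq);
	free(cs);
	return 0;
}

The design choice is that the common governor code stores and passes around only the policy_dbs_info pointer, while each governor gets its private per-policy fields by embedding that structure at a known offset.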
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index e0d111024d48..1c25ef405616 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -18,95 +18,193 @@
18 18
19#include <linux/export.h> 19#include <linux/export.h>
20#include <linux/kernel_stat.h> 20#include <linux/kernel_stat.h>
21#include <linux/sched.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
22 23
23#include "cpufreq_governor.h" 24#include "cpufreq_governor.h"
24 25
25static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) 26static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
26{ 27
27 if (have_governor_per_policy()) 28static DEFINE_MUTEX(gov_dbs_data_mutex);
28 return dbs_data->cdata->attr_group_gov_pol;
29 else
30 return dbs_data->cdata->attr_group_gov_sys;
31}
32 29
33void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) 30/* Common sysfs tunables */
31/**
32 * store_sampling_rate - update sampling rate effective immediately if needed.
33 *
34 * If new rate is smaller than the old, simply updating
35 * dbs.sampling_rate might not be appropriate. For example, if the
36 * original sampling_rate was 1 second and the requested new sampling rate is 10
37 * ms because the user needs immediate reaction from ondemand governor, but not
38 * sure if higher frequency will be required or not, then, the governor may
39 * change the sampling rate too late; up to 1 second later. Thus, if we are
40 * reducing the sampling rate, we need to make the new value effective
41 * immediately.
42 *
43 * This must be called with dbs_data->mutex held, otherwise traversing
44 * policy_dbs_list isn't safe.
45 */
46ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
47 size_t count)
34{ 48{
35 struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); 49 struct policy_dbs_info *policy_dbs;
36 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 50 unsigned int rate;
37 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 51 int ret;
38 struct cpufreq_policy *policy = cdbs->shared->policy; 52 ret = sscanf(buf, "%u", &rate);
39 unsigned int sampling_rate; 53 if (ret != 1)
40 unsigned int max_load = 0; 54 return -EINVAL;
41 unsigned int ignore_nice;
42 unsigned int j;
43 55
44 if (dbs_data->cdata->governor == GOV_ONDEMAND) { 56 dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate);
45 struct od_cpu_dbs_info_s *od_dbs_info =
46 dbs_data->cdata->get_cpu_dbs_info_s(cpu);
47 57
58 /*
59 * We are operating under dbs_data->mutex and so the list and its
60 * entries can't be freed concurrently.
61 */
62 list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
63 mutex_lock(&policy_dbs->timer_mutex);
48 /* 64 /*
49 * Sometimes, the ondemand governor uses an additional 65 * On 32-bit architectures this may race with the
50 * multiplier to give long delays. So apply this multiplier to 66 * sample_delay_ns read in dbs_update_util_handler(), but that
51 * the 'sampling_rate', so as to keep the wake-up-from-idle 67 * really doesn't matter. If the read returns a value that's
52 * detection logic a bit conservative. 68 * too big, the sample will be skipped, but the next invocation
69 * of dbs_update_util_handler() (when the update has been
70 * completed) will take a sample.
71 *
72 * If this runs in parallel with dbs_work_handler(), we may end
73 * up overwriting the sample_delay_ns value that it has just
74 * written, but it will be corrected next time a sample is
75 * taken, so it shouldn't be significant.
53 */ 76 */
54 sampling_rate = od_tuners->sampling_rate; 77 gov_update_sample_delay(policy_dbs, 0);
55 sampling_rate *= od_dbs_info->rate_mult; 78 mutex_unlock(&policy_dbs->timer_mutex);
79 }
56 80
57 ignore_nice = od_tuners->ignore_nice_load; 81 return count;
58 } else { 82}
59 sampling_rate = cs_tuners->sampling_rate; 83EXPORT_SYMBOL_GPL(store_sampling_rate);
60 ignore_nice = cs_tuners->ignore_nice_load; 84
85/**
86 * gov_update_cpu_data - Update CPU load data.
87 * @dbs_data: Top-level governor data pointer.
88 *
89 * Update CPU load data for all CPUs in the domain governed by @dbs_data
90 * (that may be a single policy or a bunch of them if governor tunables are
91 * system-wide).
92 *
93 * Call under the @dbs_data mutex.
94 */
95void gov_update_cpu_data(struct dbs_data *dbs_data)
96{
97 struct policy_dbs_info *policy_dbs;
98
99 list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
100 unsigned int j;
101
102 for_each_cpu(j, policy_dbs->policy->cpus) {
103 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
104
105 j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall,
106 dbs_data->io_is_busy);
107 if (dbs_data->ignore_nice_load)
108 j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
109 }
61 } 110 }
111}
112EXPORT_SYMBOL_GPL(gov_update_cpu_data);
113
114static inline struct dbs_data *to_dbs_data(struct kobject *kobj)
115{
116 return container_of(kobj, struct dbs_data, kobj);
117}
118
119static inline struct governor_attr *to_gov_attr(struct attribute *attr)
120{
121 return container_of(attr, struct governor_attr, attr);
122}
123
124static ssize_t governor_show(struct kobject *kobj, struct attribute *attr,
125 char *buf)
126{
127 struct dbs_data *dbs_data = to_dbs_data(kobj);
128 struct governor_attr *gattr = to_gov_attr(attr);
129
130 return gattr->show(dbs_data, buf);
131}
132
133static ssize_t governor_store(struct kobject *kobj, struct attribute *attr,
134 const char *buf, size_t count)
135{
136 struct dbs_data *dbs_data = to_dbs_data(kobj);
137 struct governor_attr *gattr = to_gov_attr(attr);
138 int ret = -EBUSY;
139
140 mutex_lock(&dbs_data->mutex);
141
142 if (dbs_data->usage_count)
143 ret = gattr->store(dbs_data, buf, count);
144
145 mutex_unlock(&dbs_data->mutex);
146
147 return ret;
148}
149
150/*
151 * Sysfs Ops for accessing governor attributes.
152 *
153 * All show/store invocations for governor specific sysfs attributes, will first
154 * call the below show/store callbacks and the attribute specific callback will
155 * be called from within it.
156 */
157static const struct sysfs_ops governor_sysfs_ops = {
158 .show = governor_show,
159 .store = governor_store,
160};
161
162unsigned int dbs_update(struct cpufreq_policy *policy)
163{
164 struct policy_dbs_info *policy_dbs = policy->governor_data;
165 struct dbs_data *dbs_data = policy_dbs->dbs_data;
166 unsigned int ignore_nice = dbs_data->ignore_nice_load;
167 unsigned int max_load = 0;
168 unsigned int sampling_rate, io_busy, j;
169
170 /*
171 * Sometimes governors may use an additional multiplier to increase
172 * sample delays temporarily. Apply that multiplier to sampling_rate
173 * so as to keep the wake-up-from-idle detection logic a bit
174 * conservative.
175 */
176 sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult;
177 /*
178 * For the purpose of ondemand, waiting for disk IO is an indication
179 * that you're performance critical, and not that the system is actually
180 * idle, so do not add the iowait time to the CPU idle time then.
181 */
182 io_busy = dbs_data->io_is_busy;
62 183
63 /* Get Absolute Load */ 184 /* Get Absolute Load */
64 for_each_cpu(j, policy->cpus) { 185 for_each_cpu(j, policy->cpus) {
65 struct cpu_dbs_info *j_cdbs; 186 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
66 u64 cur_wall_time, cur_idle_time; 187 u64 cur_wall_time, cur_idle_time;
67 unsigned int idle_time, wall_time; 188 unsigned int idle_time, wall_time;
68 unsigned int load; 189 unsigned int load;
69 int io_busy = 0;
70
71 j_cdbs = dbs_data->cdata->get_cpu_cdbs(j);
72 190
73 /*
74 * For the purpose of ondemand, waiting for disk IO is
75 * an indication that you're performance critical, and
76 * not that the system is actually idle. So do not add
77 * the iowait time to the cpu idle time.
78 */
79 if (dbs_data->cdata->governor == GOV_ONDEMAND)
80 io_busy = od_tuners->io_is_busy;
81 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy); 191 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy);
82 192
83 wall_time = (unsigned int) 193 wall_time = cur_wall_time - j_cdbs->prev_cpu_wall;
84 (cur_wall_time - j_cdbs->prev_cpu_wall);
85 j_cdbs->prev_cpu_wall = cur_wall_time; 194 j_cdbs->prev_cpu_wall = cur_wall_time;
86 195
87 if (cur_idle_time < j_cdbs->prev_cpu_idle) 196 if (cur_idle_time <= j_cdbs->prev_cpu_idle) {
88 cur_idle_time = j_cdbs->prev_cpu_idle; 197 idle_time = 0;
89 198 } else {
90 idle_time = (unsigned int) 199 idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
91 (cur_idle_time - j_cdbs->prev_cpu_idle); 200 j_cdbs->prev_cpu_idle = cur_idle_time;
92 j_cdbs->prev_cpu_idle = cur_idle_time; 201 }
93 202
94 if (ignore_nice) { 203 if (ignore_nice) {
95 u64 cur_nice; 204 u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
96 unsigned long cur_nice_jiffies;
97
98 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
99 cdbs->prev_cpu_nice;
100 /*
101 * Assumption: nice time between sampling periods will
102 * be less than 2^32 jiffies for 32 bit sys
103 */
104 cur_nice_jiffies = (unsigned long)
105 cputime64_to_jiffies64(cur_nice);
106 205
107 cdbs->prev_cpu_nice = 206 idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice);
108 kcpustat_cpu(j).cpustat[CPUTIME_NICE]; 207 j_cdbs->prev_cpu_nice = cur_nice;
109 idle_time += jiffies_to_usecs(cur_nice_jiffies);
110 } 208 }
111 209
112 if (unlikely(!wall_time || wall_time < idle_time)) 210 if (unlikely(!wall_time || wall_time < idle_time))
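
The sampling loop above (dbs_check_cpu() reworked into dbs_update()) computes a per-CPU load as the busy share of the wall time elapsed since the previous sample, clamping a negative idle delta to zero and, when ignore_nice_load is set, counting nice time as idle. A standalone sketch of that arithmetic with made-up numbers (sample_load() is an illustrative name, not a kernel function):

#include <stdio.h>

/* Busy percentage over one sampling interval; all times in microseconds. */
static unsigned int sample_load(unsigned long long wall_time,
				unsigned long long idle_time,
				unsigned long long nice_time,
				int ignore_nice)
{
	if (ignore_nice)
		idle_time += nice_time;		/* treat nice time as idle */
	if (!wall_time || wall_time < idle_time)
		return 0;			/* degenerate sample, skip it */
	return (unsigned int)(100 * (wall_time - idle_time) / wall_time);
}

int main(void)
{
	/* 10 ms interval, 4 ms idle, 1 ms nice time. */
	printf("%u%%\n", sample_load(10000, 4000, 1000, 0));	/* 60% */
	printf("%u%%\n", sample_load(10000, 4000, 1000, 1));	/* 50% */
	return 0;
}

With a 10 ms interval, 4 ms idle and 1 ms nice time this yields 60% load, or 50% once nice time is also counted as idle; the policy-wide load used by the governors is the maximum of these per-CPU values.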
@@ -128,10 +226,10 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
128 * dropped down. So we perform the copy only once, upon the 226 * dropped down. So we perform the copy only once, upon the
129 * first wake-up from idle.) 227 * first wake-up from idle.)
130 * 228 *
131 * Detecting this situation is easy: the governor's deferrable 229 * Detecting this situation is easy: the governor's utilization
132 * timer would not have fired during CPU-idle periods. Hence 230 * update handler would not have run during CPU-idle periods.
133 * an unusually large 'wall_time' (as compared to the sampling 231 * Hence, an unusually large 'wall_time' (as compared to the
134 * rate) indicates this scenario. 232 * sampling rate) indicates this scenario.
135 * 233 *
136 * prev_load can be zero in two cases and we must recalculate it 234 * prev_load can be zero in two cases and we must recalculate it
137 * for both cases: 235 * for both cases:
@@ -156,222 +254,224 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
156 if (load > max_load) 254 if (load > max_load)
157 max_load = load; 255 max_load = load;
158 } 256 }
159 257 return max_load;
160 dbs_data->cdata->gov_check_cpu(cpu, max_load);
161} 258}
162EXPORT_SYMBOL_GPL(dbs_check_cpu); 259EXPORT_SYMBOL_GPL(dbs_update);
163 260
164void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay) 261static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
262 unsigned int delay_us)
165{ 263{
166 struct dbs_data *dbs_data = policy->governor_data; 264 struct cpufreq_policy *policy = policy_dbs->policy;
167 struct cpu_dbs_info *cdbs;
168 int cpu; 265 int cpu;
169 266
267 gov_update_sample_delay(policy_dbs, delay_us);
268 policy_dbs->last_sample_time = 0;
269
170 for_each_cpu(cpu, policy->cpus) { 270 for_each_cpu(cpu, policy->cpus) {
171 cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); 271 struct cpu_dbs_info *cdbs = &per_cpu(cpu_dbs, cpu);
172 cdbs->timer.expires = jiffies + delay; 272
173 add_timer_on(&cdbs->timer, cpu); 273 cpufreq_set_update_util_data(cpu, &cdbs->update_util);
174 } 274 }
175} 275}
176EXPORT_SYMBOL_GPL(gov_add_timers);
177 276
178static inline void gov_cancel_timers(struct cpufreq_policy *policy) 277static inline void gov_clear_update_util(struct cpufreq_policy *policy)
179{ 278{
180 struct dbs_data *dbs_data = policy->governor_data;
181 struct cpu_dbs_info *cdbs;
182 int i; 279 int i;
183 280
184 for_each_cpu(i, policy->cpus) { 281 for_each_cpu(i, policy->cpus)
185 cdbs = dbs_data->cdata->get_cpu_cdbs(i); 282 cpufreq_set_update_util_data(i, NULL);
186 del_timer_sync(&cdbs->timer);
187 }
188}
189 283
190void gov_cancel_work(struct cpu_common_dbs_info *shared) 284 synchronize_sched();
191{
192 /* Tell dbs_timer_handler() to skip queuing up work items. */
193 atomic_inc(&shared->skip_work);
194 /*
195 * If dbs_timer_handler() is already running, it may not notice the
196 * incremented skip_work, so wait for it to complete to prevent its work
197 * item from being queued up after the cancel_work_sync() below.
198 */
199 gov_cancel_timers(shared->policy);
200 /*
201 * In case dbs_timer_handler() managed to run and spawn a work item
202 * before the timers have been canceled, wait for that work item to
203 * complete and then cancel all of the timers set up by it. If
204 * dbs_timer_handler() runs again at that point, it will see the
205 * positive value of skip_work and won't spawn any more work items.
206 */
207 cancel_work_sync(&shared->work);
208 gov_cancel_timers(shared->policy);
209 atomic_set(&shared->skip_work, 0);
210} 285}
211EXPORT_SYMBOL_GPL(gov_cancel_work);
212 286
213/* Will return if we need to evaluate cpu load again or not */ 287static void gov_cancel_work(struct cpufreq_policy *policy)
214static bool need_load_eval(struct cpu_common_dbs_info *shared,
215 unsigned int sampling_rate)
216{ 288{
217 if (policy_is_shared(shared->policy)) { 289 struct policy_dbs_info *policy_dbs = policy->governor_data;
218 ktime_t time_now = ktime_get();
219 s64 delta_us = ktime_us_delta(time_now, shared->time_stamp);
220
221 /* Do nothing if we recently have sampled */
222 if (delta_us < (s64)(sampling_rate / 2))
223 return false;
224 else
225 shared->time_stamp = time_now;
226 }
227 290
228 return true; 291 gov_clear_update_util(policy_dbs->policy);
292 irq_work_sync(&policy_dbs->irq_work);
293 cancel_work_sync(&policy_dbs->work);
294 atomic_set(&policy_dbs->work_count, 0);
295 policy_dbs->work_in_progress = false;
229} 296}
230 297
231static void dbs_work_handler(struct work_struct *work) 298static void dbs_work_handler(struct work_struct *work)
232{ 299{
233 struct cpu_common_dbs_info *shared = container_of(work, struct 300 struct policy_dbs_info *policy_dbs;
234 cpu_common_dbs_info, work);
235 struct cpufreq_policy *policy; 301 struct cpufreq_policy *policy;
236 struct dbs_data *dbs_data; 302 struct dbs_governor *gov;
237 unsigned int sampling_rate, delay;
238 bool eval_load;
239
240 policy = shared->policy;
241 dbs_data = policy->governor_data;
242 303
243 /* Kill all timers */ 304 policy_dbs = container_of(work, struct policy_dbs_info, work);
244 gov_cancel_timers(policy); 305 policy = policy_dbs->policy;
306 gov = dbs_governor_of(policy);
245 307
246 if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { 308 /*
247 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 309 * Make sure cpufreq_governor_limits() isn't evaluating load or the
248 310 * ondemand governor isn't updating the sampling rate in parallel.
249 sampling_rate = cs_tuners->sampling_rate; 311 */
250 } else { 312 mutex_lock(&policy_dbs->timer_mutex);
251 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 313 gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
252 314 mutex_unlock(&policy_dbs->timer_mutex);
253 sampling_rate = od_tuners->sampling_rate;
254 }
255
256 eval_load = need_load_eval(shared, sampling_rate);
257 315
316 /* Allow the utilization update handler to queue up more work. */
317 atomic_set(&policy_dbs->work_count, 0);
258 /* 318 /*
259 * Make sure cpufreq_governor_limits() isn't evaluating load in 319 * If the update below is reordered with respect to the sample delay
260 * parallel. 320 * modification, the utilization update handler may end up using a stale
321 * sample delay value.
261 */ 322 */
262 mutex_lock(&shared->timer_mutex); 323 smp_wmb();
263 delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load); 324 policy_dbs->work_in_progress = false;
264 mutex_unlock(&shared->timer_mutex); 325}
265 326
266 atomic_dec(&shared->skip_work); 327static void dbs_irq_work(struct irq_work *irq_work)
328{
329 struct policy_dbs_info *policy_dbs;
267 330
268 gov_add_timers(policy, delay); 331 policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
332 schedule_work(&policy_dbs->work);
269} 333}
270 334
271static void dbs_timer_handler(unsigned long data) 335static void dbs_update_util_handler(struct update_util_data *data, u64 time,
336 unsigned long util, unsigned long max)
272{ 337{
273 struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data; 338 struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
274 struct cpu_common_dbs_info *shared = cdbs->shared; 339 struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
340 u64 delta_ns, lst;
275 341
276 /* 342 /*
277 * Timer handler may not be allowed to queue the work at the moment, 343 * The work may not be allowed to be queued up right now.
278 * because: 344 * Possible reasons:
279 * - Another timer handler has done that 345 * - Work has already been queued up or is in progress.
280 * - We are stopping the governor 346 * - It is too early (too little time from the previous sample).
281 * - Or we are updating the sampling rate of the ondemand governor
282 */ 347 */
283 if (atomic_inc_return(&shared->skip_work) > 1) 348 if (policy_dbs->work_in_progress)
284 atomic_dec(&shared->skip_work); 349 return;
285 else
286 queue_work(system_wq, &shared->work);
287}
288 350
289static void set_sampling_rate(struct dbs_data *dbs_data, 351 /*
290 unsigned int sampling_rate) 352 * If the reads below are reordered before the check above, the value
291{ 353 * of sample_delay_ns used in the computation may be stale.
292 if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { 354 */
293 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 355 smp_rmb();
294 cs_tuners->sampling_rate = sampling_rate; 356 lst = READ_ONCE(policy_dbs->last_sample_time);
295 } else { 357 delta_ns = time - lst;
296 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 358 if ((s64)delta_ns < policy_dbs->sample_delay_ns)
297 od_tuners->sampling_rate = sampling_rate; 359 return;
360
361 /*
362 * If the policy is not shared, the irq_work may be queued up right away
363 * at this point. Otherwise, we need to ensure that only one of the
364 * CPUs sharing the policy will do that.
365 */
366 if (policy_dbs->is_shared) {
367 if (!atomic_add_unless(&policy_dbs->work_count, 1, 1))
368 return;
369
370 /*
371 * If another CPU updated last_sample_time in the meantime, we
372 * shouldn't be here, so clear the work counter and bail out.
373 */
374 if (unlikely(lst != READ_ONCE(policy_dbs->last_sample_time))) {
375 atomic_set(&policy_dbs->work_count, 0);
376 return;
377 }
298 } 378 }
379
380 policy_dbs->last_sample_time = time;
381 policy_dbs->work_in_progress = true;
382 irq_work_queue(&policy_dbs->irq_work);
299} 383}
300 384
301static int alloc_common_dbs_info(struct cpufreq_policy *policy, 385static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy,
302 struct common_dbs_data *cdata) 386 struct dbs_governor *gov)
303{ 387{
304 struct cpu_common_dbs_info *shared; 388 struct policy_dbs_info *policy_dbs;
305 int j; 389 int j;
306 390
307 /* Allocate memory for the common information for policy->cpus */ 391 /* Allocate memory for per-policy governor data. */
308 shared = kzalloc(sizeof(*shared), GFP_KERNEL); 392 policy_dbs = gov->alloc();
309 if (!shared) 393 if (!policy_dbs)
310 return -ENOMEM; 394 return NULL;
311 395
312 /* Set shared for all CPUs, online+offline */ 396 policy_dbs->policy = policy;
313 for_each_cpu(j, policy->related_cpus) 397 mutex_init(&policy_dbs->timer_mutex);
314 cdata->get_cpu_cdbs(j)->shared = shared; 398 atomic_set(&policy_dbs->work_count, 0);
399 init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
400 INIT_WORK(&policy_dbs->work, dbs_work_handler);
315 401
316 mutex_init(&shared->timer_mutex); 402 /* Set policy_dbs for all CPUs, online+offline */
317 atomic_set(&shared->skip_work, 0); 403 for_each_cpu(j, policy->related_cpus) {
318 INIT_WORK(&shared->work, dbs_work_handler); 404 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
319 return 0; 405
406 j_cdbs->policy_dbs = policy_dbs;
407 j_cdbs->update_util.func = dbs_update_util_handler;
408 }
409 return policy_dbs;
320} 410}
321 411
322static void free_common_dbs_info(struct cpufreq_policy *policy, 412static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs,
323 struct common_dbs_data *cdata) 413 struct dbs_governor *gov)
324{ 414{
325 struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu);
326 struct cpu_common_dbs_info *shared = cdbs->shared;
327 int j; 415 int j;
328 416
329 mutex_destroy(&shared->timer_mutex); 417 mutex_destroy(&policy_dbs->timer_mutex);
330 418
331 for_each_cpu(j, policy->cpus) 419 for_each_cpu(j, policy_dbs->policy->related_cpus) {
332 cdata->get_cpu_cdbs(j)->shared = NULL; 420 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
333 421
334 kfree(shared); 422 j_cdbs->policy_dbs = NULL;
423 j_cdbs->update_util.func = NULL;
424 }
425 gov->free(policy_dbs);
335} 426}
336 427
337static int cpufreq_governor_init(struct cpufreq_policy *policy, 428static int cpufreq_governor_init(struct cpufreq_policy *policy)
338 struct dbs_data *dbs_data,
339 struct common_dbs_data *cdata)
340{ 429{
430 struct dbs_governor *gov = dbs_governor_of(policy);
431 struct dbs_data *dbs_data;
432 struct policy_dbs_info *policy_dbs;
341 unsigned int latency; 433 unsigned int latency;
342 int ret; 434 int ret = 0;
343 435
344 /* State should be equivalent to EXIT */ 436 /* State should be equivalent to EXIT */
345 if (policy->governor_data) 437 if (policy->governor_data)
346 return -EBUSY; 438 return -EBUSY;
347 439
348 if (dbs_data) { 440 policy_dbs = alloc_policy_dbs_info(policy, gov);
349 if (WARN_ON(have_governor_per_policy())) 441 if (!policy_dbs)
350 return -EINVAL; 442 return -ENOMEM;
351 443
352 ret = alloc_common_dbs_info(policy, cdata); 444 /* Protect gov->gdbs_data against concurrent updates. */
353 if (ret) 445 mutex_lock(&gov_dbs_data_mutex);
354 return ret;
355 446
447 dbs_data = gov->gdbs_data;
448 if (dbs_data) {
449 if (WARN_ON(have_governor_per_policy())) {
450 ret = -EINVAL;
451 goto free_policy_dbs_info;
452 }
453 policy_dbs->dbs_data = dbs_data;
454 policy->governor_data = policy_dbs;
455
456 mutex_lock(&dbs_data->mutex);
356 dbs_data->usage_count++; 457 dbs_data->usage_count++;
357 policy->governor_data = dbs_data; 458 list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
358 return 0; 459 mutex_unlock(&dbs_data->mutex);
460 goto out;
359 } 461 }
360 462
361 dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL); 463 dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL);
362 if (!dbs_data) 464 if (!dbs_data) {
363 return -ENOMEM; 465 ret = -ENOMEM;
364 466 goto free_policy_dbs_info;
365 ret = alloc_common_dbs_info(policy, cdata); 467 }
366 if (ret)
367 goto free_dbs_data;
368 468
369 dbs_data->cdata = cdata; 469 INIT_LIST_HEAD(&dbs_data->policy_dbs_list);
370 dbs_data->usage_count = 1; 470 mutex_init(&dbs_data->mutex);
371 471
372 ret = cdata->init(dbs_data, !policy->governor->initialized); 472 ret = gov->init(dbs_data, !policy->governor->initialized);
373 if (ret) 473 if (ret)
374 goto free_common_dbs_info; 474 goto free_policy_dbs_info;
375 475
376 /* policy latency is in ns. Convert it to us first */ 476 /* policy latency is in ns. Convert it to us first */
377 latency = policy->cpuinfo.transition_latency / 1000; 477 latency = policy->cpuinfo.transition_latency / 1000;
@@ -381,216 +481,156 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy,
381 /* Bring kernel and HW constraints together */ 481 /* Bring kernel and HW constraints together */
382 dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, 482 dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate,
383 MIN_LATENCY_MULTIPLIER * latency); 483 MIN_LATENCY_MULTIPLIER * latency);
384 set_sampling_rate(dbs_data, max(dbs_data->min_sampling_rate, 484 dbs_data->sampling_rate = max(dbs_data->min_sampling_rate,
385 latency * LATENCY_MULTIPLIER)); 485 LATENCY_MULTIPLIER * latency);
386 486
387 if (!have_governor_per_policy()) 487 if (!have_governor_per_policy())
388 cdata->gdbs_data = dbs_data; 488 gov->gdbs_data = dbs_data;
389 489
390 policy->governor_data = dbs_data; 490 policy->governor_data = policy_dbs;
391 491
392 ret = sysfs_create_group(get_governor_parent_kobj(policy), 492 policy_dbs->dbs_data = dbs_data;
393 get_sysfs_attr(dbs_data)); 493 dbs_data->usage_count = 1;
394 if (ret) 494 list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
395 goto reset_gdbs_data;
396 495
397 return 0; 496 gov->kobj_type.sysfs_ops = &governor_sysfs_ops;
497 ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type,
498 get_governor_parent_kobj(policy),
499 "%s", gov->gov.name);
500 if (!ret)
501 goto out;
502
503 /* Failure, so roll back. */
504 pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret);
398 505
399reset_gdbs_data:
400 policy->governor_data = NULL; 506 policy->governor_data = NULL;
401 507
402 if (!have_governor_per_policy()) 508 if (!have_governor_per_policy())
403 cdata->gdbs_data = NULL; 509 gov->gdbs_data = NULL;
404 cdata->exit(dbs_data, !policy->governor->initialized); 510 gov->exit(dbs_data, !policy->governor->initialized);
405free_common_dbs_info:
406 free_common_dbs_info(policy, cdata);
407free_dbs_data:
408 kfree(dbs_data); 511 kfree(dbs_data);
512
513free_policy_dbs_info:
514 free_policy_dbs_info(policy_dbs, gov);
515
516out:
517 mutex_unlock(&gov_dbs_data_mutex);
409 return ret; 518 return ret;
410} 519}
411 520
412static int cpufreq_governor_exit(struct cpufreq_policy *policy, 521static int cpufreq_governor_exit(struct cpufreq_policy *policy)
413 struct dbs_data *dbs_data)
414{ 522{
415 struct common_dbs_data *cdata = dbs_data->cdata; 523 struct dbs_governor *gov = dbs_governor_of(policy);
416 struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(policy->cpu); 524 struct policy_dbs_info *policy_dbs = policy->governor_data;
525 struct dbs_data *dbs_data = policy_dbs->dbs_data;
526 int count;
417 527
418 /* State should be equivalent to INIT */ 528 /* Protect gov->gdbs_data against concurrent updates. */
419 if (!cdbs->shared || cdbs->shared->policy) 529 mutex_lock(&gov_dbs_data_mutex);
420 return -EBUSY; 530
531 mutex_lock(&dbs_data->mutex);
532 list_del(&policy_dbs->list);
533 count = --dbs_data->usage_count;
534 mutex_unlock(&dbs_data->mutex);
421 535
422 if (!--dbs_data->usage_count) { 536 if (!count) {
423 sysfs_remove_group(get_governor_parent_kobj(policy), 537 kobject_put(&dbs_data->kobj);
424 get_sysfs_attr(dbs_data));
425 538
426 policy->governor_data = NULL; 539 policy->governor_data = NULL;
427 540
428 if (!have_governor_per_policy()) 541 if (!have_governor_per_policy())
429 cdata->gdbs_data = NULL; 542 gov->gdbs_data = NULL;
430 543
431 cdata->exit(dbs_data, policy->governor->initialized == 1); 544 gov->exit(dbs_data, policy->governor->initialized == 1);
545 mutex_destroy(&dbs_data->mutex);
432 kfree(dbs_data); 546 kfree(dbs_data);
433 } else { 547 } else {
434 policy->governor_data = NULL; 548 policy->governor_data = NULL;
435 } 549 }
436 550
437 free_common_dbs_info(policy, cdata); 551 free_policy_dbs_info(policy_dbs, gov);
552
553 mutex_unlock(&gov_dbs_data_mutex);
438 return 0; 554 return 0;
439} 555}
440 556
441static int cpufreq_governor_start(struct cpufreq_policy *policy, 557static int cpufreq_governor_start(struct cpufreq_policy *policy)
442 struct dbs_data *dbs_data)
443{ 558{
444 struct common_dbs_data *cdata = dbs_data->cdata; 559 struct dbs_governor *gov = dbs_governor_of(policy);
445 unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu; 560 struct policy_dbs_info *policy_dbs = policy->governor_data;
446 struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu); 561 struct dbs_data *dbs_data = policy_dbs->dbs_data;
447 struct cpu_common_dbs_info *shared = cdbs->shared; 562 unsigned int sampling_rate, ignore_nice, j;
448 int io_busy = 0; 563 unsigned int io_busy;
449 564
450 if (!policy->cur) 565 if (!policy->cur)
451 return -EINVAL; 566 return -EINVAL;
452 567
453 /* State should be equivalent to INIT */ 568 policy_dbs->is_shared = policy_is_shared(policy);
454 if (!shared || shared->policy) 569 policy_dbs->rate_mult = 1;
455 return -EBUSY;
456 570
457 if (cdata->governor == GOV_CONSERVATIVE) { 571 sampling_rate = dbs_data->sampling_rate;
458 struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; 572 ignore_nice = dbs_data->ignore_nice_load;
459 573 io_busy = dbs_data->io_is_busy;
460 sampling_rate = cs_tuners->sampling_rate;
461 ignore_nice = cs_tuners->ignore_nice_load;
462 } else {
463 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
464
465 sampling_rate = od_tuners->sampling_rate;
466 ignore_nice = od_tuners->ignore_nice_load;
467 io_busy = od_tuners->io_is_busy;
468 }
469
470 shared->policy = policy;
471 shared->time_stamp = ktime_get();
472 574
473 for_each_cpu(j, policy->cpus) { 575 for_each_cpu(j, policy->cpus) {
474 struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j); 576 struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
475 unsigned int prev_load; 577 unsigned int prev_load;
476 578
477 j_cdbs->prev_cpu_idle = 579 j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);
478 get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);
479 580
480 prev_load = (unsigned int)(j_cdbs->prev_cpu_wall - 581 prev_load = j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle;
481 j_cdbs->prev_cpu_idle); 582 j_cdbs->prev_load = 100 * prev_load / (unsigned int)j_cdbs->prev_cpu_wall;
482 j_cdbs->prev_load = 100 * prev_load /
483 (unsigned int)j_cdbs->prev_cpu_wall;
484 583
485 if (ignore_nice) 584 if (ignore_nice)
486 j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; 585 j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
487
488 __setup_timer(&j_cdbs->timer, dbs_timer_handler,
489 (unsigned long)j_cdbs,
490 TIMER_DEFERRABLE | TIMER_IRQSAFE);
491 } 586 }
492 587
493 if (cdata->governor == GOV_CONSERVATIVE) { 588 gov->start(policy);
494 struct cs_cpu_dbs_info_s *cs_dbs_info =
495 cdata->get_cpu_dbs_info_s(cpu);
496
497 cs_dbs_info->down_skip = 0;
498 cs_dbs_info->requested_freq = policy->cur;
499 } else {
500 struct od_ops *od_ops = cdata->gov_ops;
501 struct od_cpu_dbs_info_s *od_dbs_info = cdata->get_cpu_dbs_info_s(cpu);
502
503 od_dbs_info->rate_mult = 1;
504 od_dbs_info->sample_type = OD_NORMAL_SAMPLE;
505 od_ops->powersave_bias_init_cpu(cpu);
506 }
507 589
508 gov_add_timers(policy, delay_for_sampling_rate(sampling_rate)); 590 gov_set_update_util(policy_dbs, sampling_rate);
509 return 0; 591 return 0;
510} 592}
511 593
512static int cpufreq_governor_stop(struct cpufreq_policy *policy, 594static int cpufreq_governor_stop(struct cpufreq_policy *policy)
513 struct dbs_data *dbs_data)
514{ 595{
515 struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(policy->cpu); 596 gov_cancel_work(policy);
516 struct cpu_common_dbs_info *shared = cdbs->shared;
517
518 /* State should be equivalent to START */
519 if (!shared || !shared->policy)
520 return -EBUSY;
521
522 gov_cancel_work(shared);
523 shared->policy = NULL;
524
525 return 0; 597 return 0;
526} 598}
527 599
528static int cpufreq_governor_limits(struct cpufreq_policy *policy, 600static int cpufreq_governor_limits(struct cpufreq_policy *policy)
529 struct dbs_data *dbs_data)
530{ 601{
531 struct common_dbs_data *cdata = dbs_data->cdata; 602 struct policy_dbs_info *policy_dbs = policy->governor_data;
532 unsigned int cpu = policy->cpu;
533 struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu);
534 603
535 /* State should be equivalent to START */ 604 mutex_lock(&policy_dbs->timer_mutex);
536 if (!cdbs->shared || !cdbs->shared->policy) 605
537 return -EBUSY; 606 if (policy->max < policy->cur)
607 __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
608 else if (policy->min > policy->cur)
609 __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
610
611 gov_update_sample_delay(policy_dbs, 0);
538 612
539 mutex_lock(&cdbs->shared->timer_mutex); 613 mutex_unlock(&policy_dbs->timer_mutex);
540 if (policy->max < cdbs->shared->policy->cur)
541 __cpufreq_driver_target(cdbs->shared->policy, policy->max,
542 CPUFREQ_RELATION_H);
543 else if (policy->min > cdbs->shared->policy->cur)
544 __cpufreq_driver_target(cdbs->shared->policy, policy->min,
545 CPUFREQ_RELATION_L);
546 dbs_check_cpu(dbs_data, cpu);
547 mutex_unlock(&cdbs->shared->timer_mutex);
548 614
549 return 0; 615 return 0;
550} 616}
551 617
552int cpufreq_governor_dbs(struct cpufreq_policy *policy, 618int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
553 struct common_dbs_data *cdata, unsigned int event)
554{ 619{
555 struct dbs_data *dbs_data; 620 if (event == CPUFREQ_GOV_POLICY_INIT) {
556 int ret; 621 return cpufreq_governor_init(policy);
557 622 } else if (policy->governor_data) {
558 /* Lock governor to block concurrent initialization of governor */ 623 switch (event) {
559 mutex_lock(&cdata->mutex); 624 case CPUFREQ_GOV_POLICY_EXIT:
560 625 return cpufreq_governor_exit(policy);
561 if (have_governor_per_policy()) 626 case CPUFREQ_GOV_START:
562 dbs_data = policy->governor_data; 627 return cpufreq_governor_start(policy);
563 else 628 case CPUFREQ_GOV_STOP:
564 dbs_data = cdata->gdbs_data; 629 return cpufreq_governor_stop(policy);
565 630 case CPUFREQ_GOV_LIMITS:
566 if (!dbs_data && (event != CPUFREQ_GOV_POLICY_INIT)) { 631 return cpufreq_governor_limits(policy);
567 ret = -EINVAL; 632 }
568 goto unlock;
569 }
570
571 switch (event) {
572 case CPUFREQ_GOV_POLICY_INIT:
573 ret = cpufreq_governor_init(policy, dbs_data, cdata);
574 break;
575 case CPUFREQ_GOV_POLICY_EXIT:
576 ret = cpufreq_governor_exit(policy, dbs_data);
577 break;
578 case CPUFREQ_GOV_START:
579 ret = cpufreq_governor_start(policy, dbs_data);
580 break;
581 case CPUFREQ_GOV_STOP:
582 ret = cpufreq_governor_stop(policy, dbs_data);
583 break;
584 case CPUFREQ_GOV_LIMITS:
585 ret = cpufreq_governor_limits(policy, dbs_data);
586 break;
587 default:
588 ret = -EINVAL;
589 } 633 }
590 634 return -EINVAL;
591unlock:
592 mutex_unlock(&cdata->mutex);
593
594 return ret;
595} 635}
596EXPORT_SYMBOL_GPL(cpufreq_governor_dbs); 636EXPORT_SYMBOL_GPL(cpufreq_governor_dbs);
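
The core change in this file is that the per-CPU deferrable timers are gone: each CPU registers an update_util hook that runs in scheduler context, and dbs_update_util_handler() only decides whether a sample is due before bouncing the real work through an irq_work and then a regular work item, where sleeping and calling __cpufreq_driver_target() are allowed. The work_in_progress flag and the work_count atomic keep the CPUs of a shared policy from queuing the same sample twice. Below is a simplified model of just that gating decision, using C11 atomics; the struct and function names are illustrative, and the completion path that clears the flags again is left out.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified per-policy gating state; names loosely mirror the diff. */
struct policy_gate {
	bool work_in_progress;			/* a sample is queued or running */
	atomic_int work_count;			/* one CPU of a shared policy wins */
	unsigned long long last_sample_time;	/* ns */
	unsigned long long sample_delay_ns;
	bool is_shared;
};

/* Decide whether this utilization update should trigger a new sample. */
static bool should_sample(struct policy_gate *g, unsigned long long now)
{
	if (g->work_in_progress)
		return false;				/* already being handled */
	if (now - g->last_sample_time < g->sample_delay_ns)
		return false;				/* too early */
	if (g->is_shared) {
		int zero = 0;
		/* Only the first CPU to flip 0 -> 1 may queue the work. */
		if (!atomic_compare_exchange_strong(&g->work_count, &zero, 1))
			return false;
	}
	g->last_sample_time = now;
	g->work_in_progress = true;	/* cleared once the work item completes */
	return true;
}

int main(void)
{
	struct policy_gate g = { .sample_delay_ns = 10000000ULL, .is_shared = true };
	int first, second;

	atomic_init(&g.work_count, 0);
	first = should_sample(&g, 10000000ULL);		/* 1: sample due, this CPU wins */
	second = should_sample(&g, 10000001ULL);	/* 0: work already in progress */
	printf("%d %d\n", first, second);
	return 0;
}

In the kernel the equivalent of the compare-and-swap is atomic_add_unless(), and dbs_work_handler() resets work_count and clears work_in_progress once the sample has actually been taken, so the next utilization update can queue new work.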
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 91e767a058a7..61ff82fe0613 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -18,6 +18,7 @@
18#define _CPUFREQ_GOVERNOR_H 18#define _CPUFREQ_GOVERNOR_H
19 19
20#include <linux/atomic.h> 20#include <linux/atomic.h>
21#include <linux/irq_work.h>
21#include <linux/cpufreq.h> 22#include <linux/cpufreq.h>
22#include <linux/kernel_stat.h> 23#include <linux/kernel_stat.h>
23#include <linux/module.h> 24#include <linux/module.h>
@@ -41,96 +42,68 @@
41enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; 42enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE};
42 43
43/* 44/*
44 * Macro for creating governors sysfs routines 45 * Abbreviations:
45 * 46 * dbs: used as a shortform for demand based switching It helps to keep variable
46 * - gov_sys: One governor instance per whole system 47 * names smaller, simpler
47 * - gov_pol: One governor instance per policy 48 * cdbs: common dbs
49 * od_*: On-demand governor
50 * cs_*: Conservative governor
48 */ 51 */
49 52
50/* Create attributes */ 53/* Governor demand based switching data (per-policy or global). */
51#define gov_sys_attr_ro(_name) \ 54struct dbs_data {
52static struct global_attr _name##_gov_sys = \ 55 int usage_count;
53__ATTR(_name, 0444, show_##_name##_gov_sys, NULL) 56 void *tuners;
54 57 unsigned int min_sampling_rate;
55#define gov_sys_attr_rw(_name) \ 58 unsigned int ignore_nice_load;
56static struct global_attr _name##_gov_sys = \ 59 unsigned int sampling_rate;
57__ATTR(_name, 0644, show_##_name##_gov_sys, store_##_name##_gov_sys) 60 unsigned int sampling_down_factor;
58 61 unsigned int up_threshold;
59#define gov_pol_attr_ro(_name) \ 62 unsigned int io_is_busy;
60static struct freq_attr _name##_gov_pol = \
61__ATTR(_name, 0444, show_##_name##_gov_pol, NULL)
62
63#define gov_pol_attr_rw(_name) \
64static struct freq_attr _name##_gov_pol = \
65__ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
66 63
67#define gov_sys_pol_attr_rw(_name) \ 64 struct kobject kobj;
68 gov_sys_attr_rw(_name); \ 65 struct list_head policy_dbs_list;
69 gov_pol_attr_rw(_name) 66 /*
67 * Protect concurrent updates to governor tunables from sysfs,
68 * policy_dbs_list and usage_count.
69 */
70 struct mutex mutex;
71};
70 72
71#define gov_sys_pol_attr_ro(_name) \ 73/* Governor's specific attributes */
72 gov_sys_attr_ro(_name); \ 74struct dbs_data;
73 gov_pol_attr_ro(_name) 75struct governor_attr {
76 struct attribute attr;
77 ssize_t (*show)(struct dbs_data *dbs_data, char *buf);
78 ssize_t (*store)(struct dbs_data *dbs_data, const char *buf,
79 size_t count);
80};
74 81
75/* Create show/store routines */ 82#define gov_show_one(_gov, file_name) \
76#define show_one(_gov, file_name) \ 83static ssize_t show_##file_name \
77static ssize_t show_##file_name##_gov_sys \ 84(struct dbs_data *dbs_data, char *buf) \
78(struct kobject *kobj, struct attribute *attr, char *buf) \
79{ \ 85{ \
80 struct _gov##_dbs_tuners *tuners = _gov##_dbs_cdata.gdbs_data->tuners; \
81 return sprintf(buf, "%u\n", tuners->file_name); \
82} \
83 \
84static ssize_t show_##file_name##_gov_pol \
85(struct cpufreq_policy *policy, char *buf) \
86{ \
87 struct dbs_data *dbs_data = policy->governor_data; \
88 struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \ 86 struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \
89 return sprintf(buf, "%u\n", tuners->file_name); \ 87 return sprintf(buf, "%u\n", tuners->file_name); \
90} 88}
91 89
92#define store_one(_gov, file_name) \ 90#define gov_show_one_common(file_name) \
93static ssize_t store_##file_name##_gov_sys \ 91static ssize_t show_##file_name \
94(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) \ 92(struct dbs_data *dbs_data, char *buf) \
95{ \
96 struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \
97 return store_##file_name(dbs_data, buf, count); \
98} \
99 \
100static ssize_t store_##file_name##_gov_pol \
101(struct cpufreq_policy *policy, const char *buf, size_t count) \
102{ \ 93{ \
103 struct dbs_data *dbs_data = policy->governor_data; \ 94 return sprintf(buf, "%u\n", dbs_data->file_name); \
104 return store_##file_name(dbs_data, buf, count); \
105} 95}
106 96
107#define show_store_one(_gov, file_name) \ 97#define gov_attr_ro(_name) \
108show_one(_gov, file_name); \ 98static struct governor_attr _name = \
109store_one(_gov, file_name) 99__ATTR(_name, 0444, show_##_name, NULL)
110 100
111/* create helper routines */ 101#define gov_attr_rw(_name) \
112#define define_get_cpu_dbs_routines(_dbs_info) \ 102static struct governor_attr _name = \
113static struct cpu_dbs_info *get_cpu_cdbs(int cpu) \ 103__ATTR(_name, 0644, show_##_name, store_##_name)
114{ \
115 return &per_cpu(_dbs_info, cpu).cdbs; \
116} \
117 \
118static void *get_cpu_dbs_info_s(int cpu) \
119{ \
120 return &per_cpu(_dbs_info, cpu); \
121}
122
123/*
124 * Abbreviations:
125 * dbs: used as a shortform for demand based switching It helps to keep variable
126 * names smaller, simpler
127 * cdbs: common dbs
128 * od_*: On-demand governor
129 * cs_*: Conservative governor
130 */
131 104
132/* Common to all CPUs of a policy */ 105/* Common to all CPUs of a policy */
133struct cpu_common_dbs_info { 106struct policy_dbs_info {
134 struct cpufreq_policy *policy; 107 struct cpufreq_policy *policy;
135 /* 108 /*
136 * Per policy mutex that serializes load evaluation from limit-change 109 * Per policy mutex that serializes load evaluation from limit-change
@@ -138,11 +111,27 @@ struct cpu_common_dbs_info {
138 */ 111 */
139 struct mutex timer_mutex; 112 struct mutex timer_mutex;
140 113
141 ktime_t time_stamp; 114 u64 last_sample_time;
142 atomic_t skip_work; 115 s64 sample_delay_ns;
116 atomic_t work_count;
117 struct irq_work irq_work;
143 struct work_struct work; 118 struct work_struct work;
119 /* dbs_data may be shared between multiple policy objects */
120 struct dbs_data *dbs_data;
121 struct list_head list;
122 /* Multiplier for increasing sample delay temporarily. */
123 unsigned int rate_mult;
124 /* Status indicators */
125 bool is_shared; /* This object is used by multiple CPUs */
126 bool work_in_progress; /* Work is being queued up or in progress */
144}; 127};
145 128
129static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs,
130 unsigned int delay_us)
131{
132 policy_dbs->sample_delay_ns = delay_us * NSEC_PER_USEC;
133}
134
146/* Per cpu structures */ 135/* Per cpu structures */
147struct cpu_dbs_info { 136struct cpu_dbs_info {
148 u64 prev_cpu_idle; 137 u64 prev_cpu_idle;
@@ -155,54 +144,14 @@ struct cpu_dbs_info {
155 * wake-up from idle. 144 * wake-up from idle.
156 */ 145 */
157 unsigned int prev_load; 146 unsigned int prev_load;
158 struct timer_list timer; 147 struct update_util_data update_util;
159 struct cpu_common_dbs_info *shared; 148 struct policy_dbs_info *policy_dbs;
160};
161
162struct od_cpu_dbs_info_s {
163 struct cpu_dbs_info cdbs;
164 struct cpufreq_frequency_table *freq_table;
165 unsigned int freq_lo;
166 unsigned int freq_lo_jiffies;
167 unsigned int freq_hi_jiffies;
168 unsigned int rate_mult;
169 unsigned int sample_type:1;
170};
171
172struct cs_cpu_dbs_info_s {
173 struct cpu_dbs_info cdbs;
174 unsigned int down_skip;
175 unsigned int requested_freq;
176};
177
178/* Per policy Governors sysfs tunables */
179struct od_dbs_tuners {
180 unsigned int ignore_nice_load;
181 unsigned int sampling_rate;
182 unsigned int sampling_down_factor;
183 unsigned int up_threshold;
184 unsigned int powersave_bias;
185 unsigned int io_is_busy;
186};
187
188struct cs_dbs_tuners {
189 unsigned int ignore_nice_load;
190 unsigned int sampling_rate;
191 unsigned int sampling_down_factor;
192 unsigned int up_threshold;
193 unsigned int down_threshold;
194 unsigned int freq_step;
195}; 149};
196 150
197/* Common Governor data across policies */ 151/* Common Governor data across policies */
198struct dbs_data; 152struct dbs_governor {
199struct common_dbs_data { 153 struct cpufreq_governor gov;
200 /* Common across governors */ 154 struct kobj_type kobj_type;
201 #define GOV_ONDEMAND 0
202 #define GOV_CONSERVATIVE 1
203 int governor;
204 struct attribute_group *attr_group_gov_sys; /* one governor - system */
205 struct attribute_group *attr_group_gov_pol; /* one governor - policy */
206 155
207 /* 156 /*
208 * Common data for platforms that don't set 157 * Common data for platforms that don't set
@@ -210,74 +159,32 @@ struct common_dbs_data {
210 */ 159 */
211 struct dbs_data *gdbs_data; 160 struct dbs_data *gdbs_data;
212 161
213 struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); 162 unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy);
214 void *(*get_cpu_dbs_info_s)(int cpu); 163 struct policy_dbs_info *(*alloc)(void);
215 unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy, 164 void (*free)(struct policy_dbs_info *policy_dbs);
216 bool modify_all);
217 void (*gov_check_cpu)(int cpu, unsigned int load);
218 int (*init)(struct dbs_data *dbs_data, bool notify); 165 int (*init)(struct dbs_data *dbs_data, bool notify);
219 void (*exit)(struct dbs_data *dbs_data, bool notify); 166 void (*exit)(struct dbs_data *dbs_data, bool notify);
220 167 void (*start)(struct cpufreq_policy *policy);
221 /* Governor specific ops, see below */
222 void *gov_ops;
223
224 /*
225 * Protects governor's data (struct dbs_data and struct common_dbs_data)
226 */
227 struct mutex mutex;
228}; 168};
229 169
230/* Governor Per policy data */ 170static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy)
231struct dbs_data { 171{
232 struct common_dbs_data *cdata; 172 return container_of(policy->governor, struct dbs_governor, gov);
233 unsigned int min_sampling_rate; 173}
234 int usage_count;
235 void *tuners;
236};
237 174
238/* Governor specific ops, will be passed to dbs_data->gov_ops */ 175/* Governor specific operations */
239struct od_ops { 176struct od_ops {
240 void (*powersave_bias_init_cpu)(int cpu);
241 unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy, 177 unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy,
242 unsigned int freq_next, unsigned int relation); 178 unsigned int freq_next, unsigned int relation);
243 void (*freq_increase)(struct cpufreq_policy *policy, unsigned int freq);
244}; 179};
245 180
246static inline int delay_for_sampling_rate(unsigned int sampling_rate) 181unsigned int dbs_update(struct cpufreq_policy *policy);
247{ 182int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event);
248 int delay = usecs_to_jiffies(sampling_rate);
249
250 /* We want all CPUs to do sampling nearly on same jiffy */
251 if (num_online_cpus() > 1)
252 delay -= jiffies % delay;
253
254 return delay;
255}
256
257#define declare_show_sampling_rate_min(_gov) \
258static ssize_t show_sampling_rate_min_gov_sys \
259(struct kobject *kobj, struct attribute *attr, char *buf) \
260{ \
261 struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \
262 return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \
263} \
264 \
265static ssize_t show_sampling_rate_min_gov_pol \
266(struct cpufreq_policy *policy, char *buf) \
267{ \
268 struct dbs_data *dbs_data = policy->governor_data; \
269 return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \
270}
271
272extern struct mutex cpufreq_governor_lock;
273
274void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay);
275void gov_cancel_work(struct cpu_common_dbs_info *shared);
276void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
277int cpufreq_governor_dbs(struct cpufreq_policy *policy,
278 struct common_dbs_data *cdata, unsigned int event);
279void od_register_powersave_bias_handler(unsigned int (*f) 183void od_register_powersave_bias_handler(unsigned int (*f)
280 (struct cpufreq_policy *, unsigned int, unsigned int), 184 (struct cpufreq_policy *, unsigned int, unsigned int),
281 unsigned int powersave_bias); 185 unsigned int powersave_bias);
282void od_unregister_powersave_bias_handler(void); 186void od_unregister_powersave_bias_handler(void);
187ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
188 size_t count);
189void gov_update_cpu_data(struct dbs_data *dbs_data);
283#endif /* _CPUFREQ_GOVERNOR_H */ 190#endif /* _CPUFREQ_GOVERNOR_H */
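For reference, the reworked header above replaces the per-system/per-policy attribute pairs with a single set of macros operating on struct dbs_data. A minimal sketch of how a governor built on this header would expose one read-write tunable, using a hypothetical "example" governor and tunable name (illustrative only, not part of the patch):

#include "cpufreq_governor.h"

/* Hypothetical governor-specific tunables kept in dbs_data->tuners. */
struct example_dbs_tuners {
	unsigned int example_threshold;
};

/* Generates show_example_threshold(struct dbs_data *, char *). */
gov_show_one(example, example_threshold);

static ssize_t store_example_threshold(struct dbs_data *dbs_data,
				       const char *buf, size_t count)
{
	struct example_dbs_tuners *tuners = dbs_data->tuners;
	unsigned int input;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	tuners->example_threshold = input;
	return count;
}

/* One attribute definition replaces the old _gov_sys/_gov_pol pair. */
gov_attr_rw(example_threshold);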
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index eae51070c034..acd80272ded6 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -16,7 +16,8 @@
16#include <linux/percpu-defs.h> 16#include <linux/percpu-defs.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/tick.h> 18#include <linux/tick.h>
19#include "cpufreq_governor.h" 19
20#include "cpufreq_ondemand.h"
20 21
21/* On-demand governor macros */ 22/* On-demand governor macros */
22#define DEF_FREQUENCY_UP_THRESHOLD (80) 23#define DEF_FREQUENCY_UP_THRESHOLD (80)
@@ -27,24 +28,10 @@
27#define MIN_FREQUENCY_UP_THRESHOLD (11) 28#define MIN_FREQUENCY_UP_THRESHOLD (11)
28#define MAX_FREQUENCY_UP_THRESHOLD (100) 29#define MAX_FREQUENCY_UP_THRESHOLD (100)
29 30
30static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
31
32static struct od_ops od_ops; 31static struct od_ops od_ops;
33 32
34#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
35static struct cpufreq_governor cpufreq_gov_ondemand;
36#endif
37
38static unsigned int default_powersave_bias; 33static unsigned int default_powersave_bias;
39 34
40static void ondemand_powersave_bias_init_cpu(int cpu)
41{
42 struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
43
44 dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
45 dbs_info->freq_lo = 0;
46}
47
48/* 35/*
49 * Not all CPUs want IO time to be accounted as busy; this depends on how 36 * Not all CPUs want IO time to be accounted as busy; this depends on how
50 * efficient idling at a higher frequency/voltage is. 37 * efficient idling at a higher frequency/voltage is.
@@ -70,8 +57,8 @@ static int should_io_be_busy(void)
70 57
71/* 58/*
72 * Find right freq to be set now with powersave_bias on. 59 * Find right freq to be set now with powersave_bias on.
73 * Returns the freq_hi to be used right now and will set freq_hi_jiffies, 60 * Returns the freq_hi to be used right now and will set freq_hi_delay_us,
74 * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs. 61 * freq_lo, and freq_lo_delay_us in percpu area for averaging freqs.
75 */ 62 */
76static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, 63static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
77 unsigned int freq_next, unsigned int relation) 64 unsigned int freq_next, unsigned int relation)
@@ -79,15 +66,15 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
79 unsigned int freq_req, freq_reduc, freq_avg; 66 unsigned int freq_req, freq_reduc, freq_avg;
80 unsigned int freq_hi, freq_lo; 67 unsigned int freq_hi, freq_lo;
81 unsigned int index = 0; 68 unsigned int index = 0;
82 unsigned int jiffies_total, jiffies_hi, jiffies_lo; 69 unsigned int delay_hi_us;
83 struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, 70 struct policy_dbs_info *policy_dbs = policy->governor_data;
84 policy->cpu); 71 struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
85 struct dbs_data *dbs_data = policy->governor_data; 72 struct dbs_data *dbs_data = policy_dbs->dbs_data;
86 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 73 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
87 74
88 if (!dbs_info->freq_table) { 75 if (!dbs_info->freq_table) {
89 dbs_info->freq_lo = 0; 76 dbs_info->freq_lo = 0;
90 dbs_info->freq_lo_jiffies = 0; 77 dbs_info->freq_lo_delay_us = 0;
91 return freq_next; 78 return freq_next;
92 } 79 }
93 80
@@ -110,31 +97,30 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
110 /* Find out how long we have to be in hi and lo freqs */ 97 /* Find out how long we have to be in hi and lo freqs */
111 if (freq_hi == freq_lo) { 98 if (freq_hi == freq_lo) {
112 dbs_info->freq_lo = 0; 99 dbs_info->freq_lo = 0;
113 dbs_info->freq_lo_jiffies = 0; 100 dbs_info->freq_lo_delay_us = 0;
114 return freq_lo; 101 return freq_lo;
115 } 102 }
116 jiffies_total = usecs_to_jiffies(od_tuners->sampling_rate); 103 delay_hi_us = (freq_avg - freq_lo) * dbs_data->sampling_rate;
117 jiffies_hi = (freq_avg - freq_lo) * jiffies_total; 104 delay_hi_us += (freq_hi - freq_lo) / 2;
118 jiffies_hi += ((freq_hi - freq_lo) / 2); 105 delay_hi_us /= freq_hi - freq_lo;
119 jiffies_hi /= (freq_hi - freq_lo); 106 dbs_info->freq_hi_delay_us = delay_hi_us;
120 jiffies_lo = jiffies_total - jiffies_hi;
121 dbs_info->freq_lo = freq_lo; 107 dbs_info->freq_lo = freq_lo;
122 dbs_info->freq_lo_jiffies = jiffies_lo; 108 dbs_info->freq_lo_delay_us = dbs_data->sampling_rate - delay_hi_us;
123 dbs_info->freq_hi_jiffies = jiffies_hi;
124 return freq_hi; 109 return freq_hi;
125} 110}
126 111
127static void ondemand_powersave_bias_init(void) 112static void ondemand_powersave_bias_init(struct cpufreq_policy *policy)
128{ 113{
129 int i; 114 struct od_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
130 for_each_online_cpu(i) { 115
131 ondemand_powersave_bias_init_cpu(i); 116 dbs_info->freq_table = cpufreq_frequency_get_table(policy->cpu);
132 } 117 dbs_info->freq_lo = 0;
133} 118}
134 119
135static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) 120static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
136{ 121{
137 struct dbs_data *dbs_data = policy->governor_data; 122 struct policy_dbs_info *policy_dbs = policy->governor_data;
123 struct dbs_data *dbs_data = policy_dbs->dbs_data;
138 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 124 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
139 125
140 if (od_tuners->powersave_bias) 126 if (od_tuners->powersave_bias)
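As a sanity check on the arithmetic above, the switch from jiffies to microseconds does not change the split itself: the sampling period is divided between freq_hi and freq_lo in proportion to where freq_avg falls between them, with the +(freq_hi - freq_lo)/2 term rounding to the nearest microsecond. A standalone illustration with hypothetical frequencies (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
	/* Hypothetical values in kHz: average 2.0 GHz between 1.6 and 2.4 GHz. */
	unsigned int freq_hi = 2400000, freq_lo = 1600000, freq_avg = 2000000;
	unsigned int sampling_rate = 4000;	/* 4 ms sampling period, in us */
	unsigned int delay_hi_us, delay_lo_us;

	/* Same computation as generic_powersave_bias_target() above. */
	delay_hi_us = (freq_avg - freq_lo) * sampling_rate;
	delay_hi_us += (freq_hi - freq_lo) / 2;
	delay_hi_us /= freq_hi - freq_lo;
	delay_lo_us = sampling_rate - delay_hi_us;

	/* freq_avg sits halfway, so the period splits 2000 us / 2000 us. */
	printf("freq_hi for %u us, freq_lo for %u us\n", delay_hi_us, delay_lo_us);
	return 0;
}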
@@ -152,21 +138,21 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
152 * (default), then we try to increase frequency. Else, we adjust the frequency 138 * (default), then we try to increase frequency. Else, we adjust the frequency
153 * proportional to load. 139 * proportional to load.
154 */ 140 */
155static void od_check_cpu(int cpu, unsigned int load) 141static void od_update(struct cpufreq_policy *policy)
156{ 142{
157 struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); 143 struct policy_dbs_info *policy_dbs = policy->governor_data;
158 struct cpufreq_policy *policy = dbs_info->cdbs.shared->policy; 144 struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
159 struct dbs_data *dbs_data = policy->governor_data; 145 struct dbs_data *dbs_data = policy_dbs->dbs_data;
160 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 146 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
147 unsigned int load = dbs_update(policy);
161 148
162 dbs_info->freq_lo = 0; 149 dbs_info->freq_lo = 0;
163 150
164 /* Check for frequency increase */ 151 /* Check for frequency increase */
165 if (load > od_tuners->up_threshold) { 152 if (load > dbs_data->up_threshold) {
166 /* If switching to max speed, apply sampling_down_factor */ 153 /* If switching to max speed, apply sampling_down_factor */
167 if (policy->cur < policy->max) 154 if (policy->cur < policy->max)
168 dbs_info->rate_mult = 155 policy_dbs->rate_mult = dbs_data->sampling_down_factor;
169 od_tuners->sampling_down_factor;
170 dbs_freq_increase(policy, policy->max); 156 dbs_freq_increase(policy, policy->max);
171 } else { 157 } else {
172 /* Calculate the next frequency proportional to load */ 158 /* Calculate the next frequency proportional to load */
@@ -177,177 +163,70 @@ static void od_check_cpu(int cpu, unsigned int load)
177 freq_next = min_f + load * (max_f - min_f) / 100; 163 freq_next = min_f + load * (max_f - min_f) / 100;
178 164
179 /* No longer fully busy, reset rate_mult */ 165 /* No longer fully busy, reset rate_mult */
180 dbs_info->rate_mult = 1; 166 policy_dbs->rate_mult = 1;
181 167
182 if (!od_tuners->powersave_bias) { 168 if (od_tuners->powersave_bias)
183 __cpufreq_driver_target(policy, freq_next, 169 freq_next = od_ops.powersave_bias_target(policy,
184 CPUFREQ_RELATION_C); 170 freq_next,
185 return; 171 CPUFREQ_RELATION_L);
186 }
187 172
188 freq_next = od_ops.powersave_bias_target(policy, freq_next,
189 CPUFREQ_RELATION_L);
190 __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C); 173 __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C);
191 } 174 }
192} 175}
193 176
194static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all) 177static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
195{ 178{
196 struct dbs_data *dbs_data = policy->governor_data; 179 struct policy_dbs_info *policy_dbs = policy->governor_data;
197 unsigned int cpu = policy->cpu; 180 struct dbs_data *dbs_data = policy_dbs->dbs_data;
198 struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, 181 struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
199 cpu); 182 int sample_type = dbs_info->sample_type;
200 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
201 int delay = 0, sample_type = dbs_info->sample_type;
202
203 if (!modify_all)
204 goto max_delay;
205 183
206 /* Common NORMAL_SAMPLE setup */ 184 /* Common NORMAL_SAMPLE setup */
207 dbs_info->sample_type = OD_NORMAL_SAMPLE; 185 dbs_info->sample_type = OD_NORMAL_SAMPLE;
208 if (sample_type == OD_SUB_SAMPLE) { 186 /*
209 delay = dbs_info->freq_lo_jiffies; 187 * OD_SUB_SAMPLE doesn't make sense if sample_delay_ns is 0, so ignore
188 * it then.
189 */
190 if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) {
210 __cpufreq_driver_target(policy, dbs_info->freq_lo, 191 __cpufreq_driver_target(policy, dbs_info->freq_lo,
211 CPUFREQ_RELATION_H); 192 CPUFREQ_RELATION_H);
212 } else { 193 return dbs_info->freq_lo_delay_us;
213 dbs_check_cpu(dbs_data, cpu);
214 if (dbs_info->freq_lo) {
215 /* Setup timer for SUB_SAMPLE */
216 dbs_info->sample_type = OD_SUB_SAMPLE;
217 delay = dbs_info->freq_hi_jiffies;
218 }
219 } 194 }
220 195
221max_delay: 196 od_update(policy);
222 if (!delay)
223 delay = delay_for_sampling_rate(od_tuners->sampling_rate
224 * dbs_info->rate_mult);
225
226 return delay;
227}
228
229/************************** sysfs interface ************************/
230static struct common_dbs_data od_dbs_cdata;
231
232/**
233 * update_sampling_rate - update sampling rate effective immediately if needed.
234 * @new_rate: new sampling rate
235 *
236 * If new rate is smaller than the old, simply updating
237 * dbs_tuners_int.sampling_rate might not be appropriate. For example, if the
238 * original sampling_rate was 1 second and the requested new sampling rate is 10
239 * ms because the user needs immediate reaction from ondemand governor, but not
240 * sure if higher frequency will be required or not, then, the governor may
241 * change the sampling rate too late; up to 1 second later. Thus, if we are
242 * reducing the sampling rate, we need to make the new value effective
243 * immediately.
244 */
245static void update_sampling_rate(struct dbs_data *dbs_data,
246 unsigned int new_rate)
247{
248 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
249 struct cpumask cpumask;
250 int cpu;
251
252 od_tuners->sampling_rate = new_rate = max(new_rate,
253 dbs_data->min_sampling_rate);
254
255 /*
256 * Lock governor so that governor start/stop can't execute in parallel.
257 */
258 mutex_lock(&od_dbs_cdata.mutex);
259
260 cpumask_copy(&cpumask, cpu_online_mask);
261
262 for_each_cpu(cpu, &cpumask) {
263 struct cpufreq_policy *policy;
264 struct od_cpu_dbs_info_s *dbs_info;
265 struct cpu_dbs_info *cdbs;
266 struct cpu_common_dbs_info *shared;
267 unsigned long next_sampling, appointed_at;
268
269 dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
270 cdbs = &dbs_info->cdbs;
271 shared = cdbs->shared;
272
273 /*
274 * A valid shared and shared->policy means governor hasn't
275 * stopped or exited yet.
276 */
277 if (!shared || !shared->policy)
278 continue;
279
280 policy = shared->policy;
281
282 /* clear all CPUs of this policy */
283 cpumask_andnot(&cpumask, &cpumask, policy->cpus);
284 197
285 /* 198 if (dbs_info->freq_lo) {
286 * Update sampling rate for CPUs whose policy is governed by 199 /* Setup timer for SUB_SAMPLE */
287 * dbs_data. In case of governor_per_policy, only a single 200 dbs_info->sample_type = OD_SUB_SAMPLE;
288 * policy will be governed by dbs_data, otherwise there can be 201 return dbs_info->freq_hi_delay_us;
289 * multiple policies that are governed by the same dbs_data.
290 */
291 if (dbs_data != policy->governor_data)
292 continue;
293
294 /*
295 * Checking this for any CPU should be fine, timers for all of
296 * them are scheduled together.
297 */
298 next_sampling = jiffies + usecs_to_jiffies(new_rate);
299 appointed_at = dbs_info->cdbs.timer.expires;
300
301 if (time_before(next_sampling, appointed_at)) {
302 gov_cancel_work(shared);
303 gov_add_timers(policy, usecs_to_jiffies(new_rate));
304
305 }
306 } 202 }
307 203
308 mutex_unlock(&od_dbs_cdata.mutex); 204 return dbs_data->sampling_rate * policy_dbs->rate_mult;
309} 205}
310 206
311static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, 207/************************** sysfs interface ************************/
312 size_t count) 208static struct dbs_governor od_dbs_gov;
313{
314 unsigned int input;
315 int ret;
316 ret = sscanf(buf, "%u", &input);
317 if (ret != 1)
318 return -EINVAL;
319
320 update_sampling_rate(dbs_data, input);
321 return count;
322}
323 209
324static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, 210static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf,
325 size_t count) 211 size_t count)
326{ 212{
327 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
328 unsigned int input; 213 unsigned int input;
329 int ret; 214 int ret;
330 unsigned int j;
331 215
332 ret = sscanf(buf, "%u", &input); 216 ret = sscanf(buf, "%u", &input);
333 if (ret != 1) 217 if (ret != 1)
334 return -EINVAL; 218 return -EINVAL;
335 od_tuners->io_is_busy = !!input; 219 dbs_data->io_is_busy = !!input;
336 220
337 /* we need to re-evaluate prev_cpu_idle */ 221 /* we need to re-evaluate prev_cpu_idle */
338 for_each_online_cpu(j) { 222 gov_update_cpu_data(dbs_data);
339 struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, 223
340 j);
341 dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
342 &dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy);
343 }
344 return count; 224 return count;
345} 225}
346 226
347static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, 227static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
348 size_t count) 228 size_t count)
349{ 229{
350 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
351 unsigned int input; 230 unsigned int input;
352 int ret; 231 int ret;
353 ret = sscanf(buf, "%u", &input); 232 ret = sscanf(buf, "%u", &input);
@@ -357,40 +236,43 @@ static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
357 return -EINVAL; 236 return -EINVAL;
358 } 237 }
359 238
360 od_tuners->up_threshold = input; 239 dbs_data->up_threshold = input;
361 return count; 240 return count;
362} 241}
363 242
364static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, 243static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
365 const char *buf, size_t count) 244 const char *buf, size_t count)
366{ 245{
367 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 246 struct policy_dbs_info *policy_dbs;
368 unsigned int input, j; 247 unsigned int input;
369 int ret; 248 int ret;
370 ret = sscanf(buf, "%u", &input); 249 ret = sscanf(buf, "%u", &input);
371 250
372 if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) 251 if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
373 return -EINVAL; 252 return -EINVAL;
374 od_tuners->sampling_down_factor = input; 253
254 dbs_data->sampling_down_factor = input;
375 255
376 /* Reset down sampling multiplier in case it was active */ 256 /* Reset down sampling multiplier in case it was active */
377 for_each_online_cpu(j) { 257 list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
378 struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, 258 /*
379 j); 259 * Doing this without locking might lead to using different
380 dbs_info->rate_mult = 1; 260 * rate_mult values in od_update() and od_dbs_timer().
261 */
262 mutex_lock(&policy_dbs->timer_mutex);
263 policy_dbs->rate_mult = 1;
264 mutex_unlock(&policy_dbs->timer_mutex);
381 } 265 }
266
382 return count; 267 return count;
383} 268}
384 269
385static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data, 270static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
386 const char *buf, size_t count) 271 const char *buf, size_t count)
387{ 272{
388 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
389 unsigned int input; 273 unsigned int input;
390 int ret; 274 int ret;
391 275
392 unsigned int j;
393
394 ret = sscanf(buf, "%u", &input); 276 ret = sscanf(buf, "%u", &input);
395 if (ret != 1) 277 if (ret != 1)
396 return -EINVAL; 278 return -EINVAL;
@@ -398,22 +280,14 @@ static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
398 if (input > 1) 280 if (input > 1)
399 input = 1; 281 input = 1;
400 282
401 if (input == od_tuners->ignore_nice_load) { /* nothing to do */ 283 if (input == dbs_data->ignore_nice_load) { /* nothing to do */
402 return count; 284 return count;
403 } 285 }
404 od_tuners->ignore_nice_load = input; 286 dbs_data->ignore_nice_load = input;
405 287
406 /* we need to re-evaluate prev_cpu_idle */ 288 /* we need to re-evaluate prev_cpu_idle */
407 for_each_online_cpu(j) { 289 gov_update_cpu_data(dbs_data);
408 struct od_cpu_dbs_info_s *dbs_info;
409 dbs_info = &per_cpu(od_cpu_dbs_info, j);
410 dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
411 &dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy);
412 if (od_tuners->ignore_nice_load)
413 dbs_info->cdbs.prev_cpu_nice =
414 kcpustat_cpu(j).cpustat[CPUTIME_NICE];
415 290
416 }
417 return count; 291 return count;
418} 292}
419 293
@@ -421,6 +295,7 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf,
421 size_t count) 295 size_t count)
422{ 296{
423 struct od_dbs_tuners *od_tuners = dbs_data->tuners; 297 struct od_dbs_tuners *od_tuners = dbs_data->tuners;
298 struct policy_dbs_info *policy_dbs;
424 unsigned int input; 299 unsigned int input;
425 int ret; 300 int ret;
426 ret = sscanf(buf, "%u", &input); 301 ret = sscanf(buf, "%u", &input);
@@ -432,59 +307,54 @@ static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf,
432 input = 1000; 307 input = 1000;
433 308
434 od_tuners->powersave_bias = input; 309 od_tuners->powersave_bias = input;
435 ondemand_powersave_bias_init(); 310
311 list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list)
312 ondemand_powersave_bias_init(policy_dbs->policy);
313
436 return count; 314 return count;
437} 315}
438 316
439show_store_one(od, sampling_rate); 317gov_show_one_common(sampling_rate);
440show_store_one(od, io_is_busy); 318gov_show_one_common(up_threshold);
441show_store_one(od, up_threshold); 319gov_show_one_common(sampling_down_factor);
442show_store_one(od, sampling_down_factor); 320gov_show_one_common(ignore_nice_load);
443show_store_one(od, ignore_nice_load); 321gov_show_one_common(min_sampling_rate);
444show_store_one(od, powersave_bias); 322gov_show_one_common(io_is_busy);
445declare_show_sampling_rate_min(od); 323gov_show_one(od, powersave_bias);
446 324
447gov_sys_pol_attr_rw(sampling_rate); 325gov_attr_rw(sampling_rate);
448gov_sys_pol_attr_rw(io_is_busy); 326gov_attr_rw(io_is_busy);
449gov_sys_pol_attr_rw(up_threshold); 327gov_attr_rw(up_threshold);
450gov_sys_pol_attr_rw(sampling_down_factor); 328gov_attr_rw(sampling_down_factor);
451gov_sys_pol_attr_rw(ignore_nice_load); 329gov_attr_rw(ignore_nice_load);
452gov_sys_pol_attr_rw(powersave_bias); 330gov_attr_rw(powersave_bias);
453gov_sys_pol_attr_ro(sampling_rate_min); 331gov_attr_ro(min_sampling_rate);
454 332
455static struct attribute *dbs_attributes_gov_sys[] = { 333static struct attribute *od_attributes[] = {
456 &sampling_rate_min_gov_sys.attr, 334 &min_sampling_rate.attr,
457 &sampling_rate_gov_sys.attr, 335 &sampling_rate.attr,
458 &up_threshold_gov_sys.attr, 336 &up_threshold.attr,
459 &sampling_down_factor_gov_sys.attr, 337 &sampling_down_factor.attr,
460 &ignore_nice_load_gov_sys.attr, 338 &ignore_nice_load.attr,
461 &powersave_bias_gov_sys.attr, 339 &powersave_bias.attr,
462 &io_is_busy_gov_sys.attr, 340 &io_is_busy.attr,
463 NULL 341 NULL
464}; 342};
465 343
466static struct attribute_group od_attr_group_gov_sys = { 344/************************** sysfs end ************************/
467 .attrs = dbs_attributes_gov_sys,
468 .name = "ondemand",
469};
470 345
471static struct attribute *dbs_attributes_gov_pol[] = { 346static struct policy_dbs_info *od_alloc(void)
472 &sampling_rate_min_gov_pol.attr, 347{
473 &sampling_rate_gov_pol.attr, 348 struct od_policy_dbs_info *dbs_info;
474 &up_threshold_gov_pol.attr,
475 &sampling_down_factor_gov_pol.attr,
476 &ignore_nice_load_gov_pol.attr,
477 &powersave_bias_gov_pol.attr,
478 &io_is_busy_gov_pol.attr,
479 NULL
480};
481 349
482static struct attribute_group od_attr_group_gov_pol = { 350 dbs_info = kzalloc(sizeof(*dbs_info), GFP_KERNEL);
483 .attrs = dbs_attributes_gov_pol, 351 return dbs_info ? &dbs_info->policy_dbs : NULL;
484 .name = "ondemand", 352}
485};
486 353
487/************************** sysfs end ************************/ 354static void od_free(struct policy_dbs_info *policy_dbs)
355{
356 kfree(to_dbs_info(policy_dbs));
357}
488 358
489static int od_init(struct dbs_data *dbs_data, bool notify) 359static int od_init(struct dbs_data *dbs_data, bool notify)
490{ 360{
@@ -503,7 +373,7 @@ static int od_init(struct dbs_data *dbs_data, bool notify)
503 put_cpu(); 373 put_cpu();
504 if (idle_time != -1ULL) { 374 if (idle_time != -1ULL) {
505 /* Idle micro accounting is supported. Use finer thresholds */ 375 /* Idle micro accounting is supported. Use finer thresholds */
506 tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; 376 dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
507 /* 377 /*
508 * In nohz/micro accounting case we set the minimum frequency 378 * In nohz/micro accounting case we set the minimum frequency
509 * not depending on HZ, but fixed (very low). The deferred 379 * not depending on HZ, but fixed (very low). The deferred
@@ -511,17 +381,17 @@ static int od_init(struct dbs_data *dbs_data, bool notify)
511 */ 381 */
512 dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; 382 dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
513 } else { 383 } else {
514 tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; 384 dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
515 385
516 /* For correct statistics, we need 10 ticks for each measure */ 386 /* For correct statistics, we need 10 ticks for each measure */
517 dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * 387 dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
518 jiffies_to_usecs(10); 388 jiffies_to_usecs(10);
519 } 389 }
520 390
521 tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; 391 dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
522 tuners->ignore_nice_load = 0; 392 dbs_data->ignore_nice_load = 0;
523 tuners->powersave_bias = default_powersave_bias; 393 tuners->powersave_bias = default_powersave_bias;
524 tuners->io_is_busy = should_io_be_busy(); 394 dbs_data->io_is_busy = should_io_be_busy();
525 395
526 dbs_data->tuners = tuners; 396 dbs_data->tuners = tuners;
527 return 0; 397 return 0;
@@ -532,33 +402,38 @@ static void od_exit(struct dbs_data *dbs_data, bool notify)
532 kfree(dbs_data->tuners); 402 kfree(dbs_data->tuners);
533} 403}
534 404
535define_get_cpu_dbs_routines(od_cpu_dbs_info); 405static void od_start(struct cpufreq_policy *policy)
406{
407 struct od_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data);
408
409 dbs_info->sample_type = OD_NORMAL_SAMPLE;
410 ondemand_powersave_bias_init(policy);
411}
536 412
537static struct od_ops od_ops = { 413static struct od_ops od_ops = {
538 .powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu,
539 .powersave_bias_target = generic_powersave_bias_target, 414 .powersave_bias_target = generic_powersave_bias_target,
540 .freq_increase = dbs_freq_increase,
541}; 415};
542 416
543static struct common_dbs_data od_dbs_cdata = { 417static struct dbs_governor od_dbs_gov = {
544 .governor = GOV_ONDEMAND, 418 .gov = {
545 .attr_group_gov_sys = &od_attr_group_gov_sys, 419 .name = "ondemand",
546 .attr_group_gov_pol = &od_attr_group_gov_pol, 420 .governor = cpufreq_governor_dbs,
547 .get_cpu_cdbs = get_cpu_cdbs, 421 .max_transition_latency = TRANSITION_LATENCY_LIMIT,
548 .get_cpu_dbs_info_s = get_cpu_dbs_info_s, 422 .owner = THIS_MODULE,
423 },
424 .kobj_type = { .default_attrs = od_attributes },
549 .gov_dbs_timer = od_dbs_timer, 425 .gov_dbs_timer = od_dbs_timer,
550 .gov_check_cpu = od_check_cpu, 426 .alloc = od_alloc,
551 .gov_ops = &od_ops, 427 .free = od_free,
552 .init = od_init, 428 .init = od_init,
553 .exit = od_exit, 429 .exit = od_exit,
554 .mutex = __MUTEX_INITIALIZER(od_dbs_cdata.mutex), 430 .start = od_start,
555}; 431};
556 432
433#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov)
434
557static void od_set_powersave_bias(unsigned int powersave_bias) 435static void od_set_powersave_bias(unsigned int powersave_bias)
558{ 436{
559 struct cpufreq_policy *policy;
560 struct dbs_data *dbs_data;
561 struct od_dbs_tuners *od_tuners;
562 unsigned int cpu; 437 unsigned int cpu;
563 cpumask_t done; 438 cpumask_t done;
564 439
@@ -567,22 +442,25 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
567 442
568 get_online_cpus(); 443 get_online_cpus();
569 for_each_online_cpu(cpu) { 444 for_each_online_cpu(cpu) {
570 struct cpu_common_dbs_info *shared; 445 struct cpufreq_policy *policy;
446 struct policy_dbs_info *policy_dbs;
447 struct dbs_data *dbs_data;
448 struct od_dbs_tuners *od_tuners;
571 449
572 if (cpumask_test_cpu(cpu, &done)) 450 if (cpumask_test_cpu(cpu, &done))
573 continue; 451 continue;
574 452
575 shared = per_cpu(od_cpu_dbs_info, cpu).cdbs.shared; 453 policy = cpufreq_cpu_get_raw(cpu);
576 if (!shared) 454 if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND)
577 continue; 455 continue;
578 456
579 policy = shared->policy; 457 policy_dbs = policy->governor_data;
580 cpumask_or(&done, &done, policy->cpus); 458 if (!policy_dbs)
581
582 if (policy->governor != &cpufreq_gov_ondemand)
583 continue; 459 continue;
584 460
585 dbs_data = policy->governor_data; 461 cpumask_or(&done, &done, policy->cpus);
462
463 dbs_data = policy_dbs->dbs_data;
586 od_tuners = dbs_data->tuners; 464 od_tuners = dbs_data->tuners;
587 od_tuners->powersave_bias = default_powersave_bias; 465 od_tuners->powersave_bias = default_powersave_bias;
588 } 466 }
@@ -605,30 +483,14 @@ void od_unregister_powersave_bias_handler(void)
605} 483}
606EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler); 484EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
607 485
608static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy,
609 unsigned int event)
610{
611 return cpufreq_governor_dbs(policy, &od_dbs_cdata, event);
612}
613
614#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
615static
616#endif
617struct cpufreq_governor cpufreq_gov_ondemand = {
618 .name = "ondemand",
619 .governor = od_cpufreq_governor_dbs,
620 .max_transition_latency = TRANSITION_LATENCY_LIMIT,
621 .owner = THIS_MODULE,
622};
623
624static int __init cpufreq_gov_dbs_init(void) 486static int __init cpufreq_gov_dbs_init(void)
625{ 487{
626 return cpufreq_register_governor(&cpufreq_gov_ondemand); 488 return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND);
627} 489}
628 490
629static void __exit cpufreq_gov_dbs_exit(void) 491static void __exit cpufreq_gov_dbs_exit(void)
630{ 492{
631 cpufreq_unregister_governor(&cpufreq_gov_ondemand); 493 cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND);
632} 494}
633 495
634MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>"); 496MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
@@ -638,6 +500,11 @@ MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
638MODULE_LICENSE("GPL"); 500MODULE_LICENSE("GPL");
639 501
640#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND 502#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
503struct cpufreq_governor *cpufreq_default_governor(void)
504{
505 return CPU_FREQ_GOV_ONDEMAND;
506}
507
641fs_initcall(cpufreq_gov_dbs_init); 508fs_initcall(cpufreq_gov_dbs_init);
642#else 509#else
643module_init(cpufreq_gov_dbs_init); 510module_init(cpufreq_gov_dbs_init);
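For the non-powersave_bias path reworked above, the frequency chosen by od_update() is a linear interpolation between the policy limits according to the measured load; a quick standalone check of that arithmetic with hypothetical limits (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
	/* Hypothetical policy limits in kHz and a 40% load sample. */
	unsigned int min_f = 800000, max_f = 2400000, load = 40;
	unsigned int freq_next;

	/* Same interpolation as od_update() when powersave_bias is 0. */
	freq_next = min_f + load * (max_f - min_f) / 100;

	printf("freq_next = %u kHz\n", freq_next);	/* 1440000 kHz */
	return 0;
}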
diff --git a/drivers/cpufreq/cpufreq_ondemand.h b/drivers/cpufreq/cpufreq_ondemand.h
new file mode 100644
index 000000000000..f0121db3cd9e
--- /dev/null
+++ b/drivers/cpufreq/cpufreq_ondemand.h
@@ -0,0 +1,30 @@
1/*
2 * Header file for CPUFreq ondemand governor and related code.
3 *
4 * Copyright (C) 2016, Intel Corporation
5 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include "cpufreq_governor.h"
13
14struct od_policy_dbs_info {
15 struct policy_dbs_info policy_dbs;
16 struct cpufreq_frequency_table *freq_table;
17 unsigned int freq_lo;
18 unsigned int freq_lo_delay_us;
19 unsigned int freq_hi_delay_us;
20 unsigned int sample_type:1;
21};
22
23static inline struct od_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs)
24{
25 return container_of(policy_dbs, struct od_policy_dbs_info, policy_dbs);
26}
27
28struct od_dbs_tuners {
29 unsigned int powersave_bias;
30};
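The new header relies on the usual embed-and-recover idiom: the generic struct policy_dbs_info is embedded in struct od_policy_dbs_info, and to_dbs_info() gets the wrapper back via container_of(). A condensed sketch of how governor code walks from a cpufreq policy to the ondemand-specific data, mirroring the pattern used in cpufreq_ondemand.c above (the helper name example_get_freq_lo is made up):

#include "cpufreq_ondemand.h"

static unsigned int example_get_freq_lo(struct cpufreq_policy *policy)
{
	/* policy->governor_data now points at the embedded policy_dbs... */
	struct policy_dbs_info *policy_dbs = policy->governor_data;
	/* ...and container_of() recovers the od-specific wrapper around it. */
	struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);

	return dbs_info->freq_lo;
}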
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index cf117deb39b1..af9f4b96f5a8 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -33,10 +33,7 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy,
33 return 0; 33 return 0;
34} 34}
35 35
36#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE 36static struct cpufreq_governor cpufreq_gov_performance = {
37static
38#endif
39struct cpufreq_governor cpufreq_gov_performance = {
40 .name = "performance", 37 .name = "performance",
41 .governor = cpufreq_governor_performance, 38 .governor = cpufreq_governor_performance,
42 .owner = THIS_MODULE, 39 .owner = THIS_MODULE,
@@ -52,6 +49,19 @@ static void __exit cpufreq_gov_performance_exit(void)
52 cpufreq_unregister_governor(&cpufreq_gov_performance); 49 cpufreq_unregister_governor(&cpufreq_gov_performance);
53} 50}
54 51
52#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
53struct cpufreq_governor *cpufreq_default_governor(void)
54{
55 return &cpufreq_gov_performance;
56}
57#endif
58#ifndef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
59struct cpufreq_governor *cpufreq_fallback_governor(void)
60{
61 return &cpufreq_gov_performance;
62}
63#endif
64
55MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); 65MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
56MODULE_DESCRIPTION("CPUfreq policy governor 'performance'"); 66MODULE_DESCRIPTION("CPUfreq policy governor 'performance'");
57MODULE_LICENSE("GPL"); 67MODULE_LICENSE("GPL");
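The performance, powersave, userspace and ondemand hunks in this series all make the same change: the governor structs become static, and a governor that is built in as the default (or, for performance, the fallback) hands a pointer to itself to the core through cpufreq_default_governor() or cpufreq_fallback_governor() instead of exporting the struct. A minimal sketch of that shape for a hypothetical built-in governor (names and Kconfig symbol made up); the core-side callers live in cpufreq.c, outside this excerpt:

#include <linux/cpufreq.h>
#include <linux/module.h>

static int example_governor_fn(struct cpufreq_policy *policy, unsigned int event)
{
	return 0;	/* a real governor acts on CPUFREQ_GOV_* events here */
}

static struct cpufreq_governor example_gov = {
	.name = "example",
	.governor = example_governor_fn,
	.owner = THIS_MODULE,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_EXAMPLE	/* hypothetical Kconfig symbol */
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &example_gov;
}
#endif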
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index e3b874c235ea..b8b400232a74 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -33,10 +33,7 @@ static int cpufreq_governor_powersave(struct cpufreq_policy *policy,
33 return 0; 33 return 0;
34} 34}
35 35
36#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE 36static struct cpufreq_governor cpufreq_gov_powersave = {
37static
38#endif
39struct cpufreq_governor cpufreq_gov_powersave = {
40 .name = "powersave", 37 .name = "powersave",
41 .governor = cpufreq_governor_powersave, 38 .governor = cpufreq_governor_powersave,
42 .owner = THIS_MODULE, 39 .owner = THIS_MODULE,
@@ -57,6 +54,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'");
57MODULE_LICENSE("GPL"); 54MODULE_LICENSE("GPL");
58 55
59#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE 56#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
57struct cpufreq_governor *cpufreq_default_governor(void)
58{
59 return &cpufreq_gov_powersave;
60}
61
60fs_initcall(cpufreq_gov_powersave_init); 62fs_initcall(cpufreq_gov_powersave_init);
61#else 63#else
62module_init(cpufreq_gov_powersave_init); 64module_init(cpufreq_gov_powersave_init);
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index 4dbf1db16aca..4d16f45ee1da 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -89,10 +89,7 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy,
89 return rc; 89 return rc;
90} 90}
91 91
92#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE 92static struct cpufreq_governor cpufreq_gov_userspace = {
93static
94#endif
95struct cpufreq_governor cpufreq_gov_userspace = {
96 .name = "userspace", 93 .name = "userspace",
97 .governor = cpufreq_governor_userspace, 94 .governor = cpufreq_governor_userspace,
98 .store_setspeed = cpufreq_set, 95 .store_setspeed = cpufreq_set,
@@ -116,6 +113,11 @@ MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'");
116MODULE_LICENSE("GPL"); 113MODULE_LICENSE("GPL");
117 114
118#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE 115#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
116struct cpufreq_governor *cpufreq_default_governor(void)
117{
118 return &cpufreq_gov_userspace;
119}
120
119fs_initcall(cpufreq_gov_userspace_init); 121fs_initcall(cpufreq_gov_userspace_init);
120#else 122#else
121module_init(cpufreq_gov_userspace_init); 123module_init(cpufreq_gov_userspace_init);
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index cd83d477e32d..cb5607495816 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -71,7 +71,7 @@ struct sample {
71 u64 mperf; 71 u64 mperf;
72 u64 tsc; 72 u64 tsc;
73 int freq; 73 int freq;
74 ktime_t time; 74 u64 time;
75}; 75};
76 76
77struct pstate_data { 77struct pstate_data {
@@ -103,13 +103,13 @@ struct _pid {
103struct cpudata { 103struct cpudata {
104 int cpu; 104 int cpu;
105 105
106 struct timer_list timer; 106 struct update_util_data update_util;
107 107
108 struct pstate_data pstate; 108 struct pstate_data pstate;
109 struct vid_data vid; 109 struct vid_data vid;
110 struct _pid pid; 110 struct _pid pid;
111 111
112 ktime_t last_sample_time; 112 u64 last_sample_time;
113 u64 prev_aperf; 113 u64 prev_aperf;
114 u64 prev_mperf; 114 u64 prev_mperf;
115 u64 prev_tsc; 115 u64 prev_tsc;
@@ -120,6 +120,7 @@ struct cpudata {
120static struct cpudata **all_cpu_data; 120static struct cpudata **all_cpu_data;
121struct pstate_adjust_policy { 121struct pstate_adjust_policy {
122 int sample_rate_ms; 122 int sample_rate_ms;
123 s64 sample_rate_ns;
123 int deadband; 124 int deadband;
124 int setpoint; 125 int setpoint;
125 int p_gain_pct; 126 int p_gain_pct;
@@ -197,8 +198,8 @@ static struct perf_limits *limits = &powersave_limits;
197 198
198static inline void pid_reset(struct _pid *pid, int setpoint, int busy, 199static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
199 int deadband, int integral) { 200 int deadband, int integral) {
200 pid->setpoint = setpoint; 201 pid->setpoint = int_tofp(setpoint);
201 pid->deadband = deadband; 202 pid->deadband = int_tofp(deadband);
202 pid->integral = int_tofp(integral); 203 pid->integral = int_tofp(integral);
203 pid->last_err = int_tofp(setpoint) - int_tofp(busy); 204 pid->last_err = int_tofp(setpoint) - int_tofp(busy);
204} 205}
@@ -224,9 +225,9 @@ static signed int pid_calc(struct _pid *pid, int32_t busy)
224 int32_t pterm, dterm, fp_error; 225 int32_t pterm, dterm, fp_error;
225 int32_t integral_limit; 226 int32_t integral_limit;
226 227
227 fp_error = int_tofp(pid->setpoint) - busy; 228 fp_error = pid->setpoint - busy;
228 229
229 if (abs(fp_error) <= int_tofp(pid->deadband)) 230 if (abs(fp_error) <= pid->deadband)
230 return 0; 231 return 0;
231 232
232 pterm = mul_fp(pid->p_gain, fp_error); 233 pterm = mul_fp(pid->p_gain, fp_error);
@@ -286,7 +287,7 @@ static inline void update_turbo_state(void)
286 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); 287 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
287} 288}
288 289
289static void intel_pstate_hwp_set(void) 290static void intel_pstate_hwp_set(const struct cpumask *cpumask)
290{ 291{
291 int min, hw_min, max, hw_max, cpu, range, adj_range; 292 int min, hw_min, max, hw_max, cpu, range, adj_range;
292 u64 value, cap; 293 u64 value, cap;
@@ -296,9 +297,7 @@ static void intel_pstate_hwp_set(void)
296 hw_max = HWP_HIGHEST_PERF(cap); 297 hw_max = HWP_HIGHEST_PERF(cap);
297 range = hw_max - hw_min; 298 range = hw_max - hw_min;
298 299
299 get_online_cpus(); 300 for_each_cpu(cpu, cpumask) {
300
301 for_each_online_cpu(cpu) {
302 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); 301 rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
303 adj_range = limits->min_perf_pct * range / 100; 302 adj_range = limits->min_perf_pct * range / 100;
304 min = hw_min + adj_range; 303 min = hw_min + adj_range;
@@ -317,7 +316,12 @@ static void intel_pstate_hwp_set(void)
317 value |= HWP_MAX_PERF(max); 316 value |= HWP_MAX_PERF(max);
318 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); 317 wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
319 } 318 }
319}
320 320
321static void intel_pstate_hwp_set_online_cpus(void)
322{
323 get_online_cpus();
324 intel_pstate_hwp_set(cpu_online_mask);
321 put_online_cpus(); 325 put_online_cpus();
322} 326}
323 327
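The mapping in intel_pstate_hwp_set() is untouched here, only re-scoped to take a cpumask: the sysfs min/max percentages are projected onto the hardware's [lowest, highest] performance range read from the HWP capabilities. A standalone walk-through with hypothetical capability values; the clamping and no_turbo handling of the original function are omitted (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
	/* Hypothetical HWP capabilities: lowest perf 8, highest perf 40. */
	int hw_min = 8, hw_max = 40;
	int min_perf_pct = 25, max_perf_pct = 100;	/* sysfs limits */
	int range = hw_max - hw_min;
	int adj_range, min, max;

	/* Same scaling as intel_pstate_hwp_set() above. */
	adj_range = min_perf_pct * range / 100;	/* 25% of 32 = 8 */
	min = hw_min + adj_range;		/* 16 */

	adj_range = max_perf_pct * range / 100;	/* 32 */
	max = hw_min + adj_range;		/* 40, i.e. hw_max */

	printf("HWP min perf = %d, max perf = %d\n", min, max);
	return 0;
}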
@@ -439,7 +443,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
439 limits->no_turbo = clamp_t(int, input, 0, 1); 443 limits->no_turbo = clamp_t(int, input, 0, 1);
440 444
441 if (hwp_active) 445 if (hwp_active)
442 intel_pstate_hwp_set(); 446 intel_pstate_hwp_set_online_cpus();
443 447
444 return count; 448 return count;
445} 449}
@@ -465,7 +469,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
465 int_tofp(100)); 469 int_tofp(100));
466 470
467 if (hwp_active) 471 if (hwp_active)
468 intel_pstate_hwp_set(); 472 intel_pstate_hwp_set_online_cpus();
469 return count; 473 return count;
470} 474}
471 475
@@ -490,7 +494,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
490 int_tofp(100)); 494 int_tofp(100));
491 495
492 if (hwp_active) 496 if (hwp_active)
493 intel_pstate_hwp_set(); 497 intel_pstate_hwp_set_online_cpus();
494 return count; 498 return count;
495} 499}
496 500
@@ -531,6 +535,9 @@ static void __init intel_pstate_sysfs_expose_params(void)
531 535
532static void intel_pstate_hwp_enable(struct cpudata *cpudata) 536static void intel_pstate_hwp_enable(struct cpudata *cpudata)
533{ 537{
538 /* First disable HWP notification interrupt as we don't process them */
539 wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
540
534 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); 541 wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
535} 542}
536 543
@@ -712,7 +719,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate)
712 if (limits->no_turbo && !limits->turbo_disabled) 719 if (limits->no_turbo && !limits->turbo_disabled)
713 val |= (u64)1 << 32; 720 val |= (u64)1 << 32;
714 721
715 wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val); 722 wrmsrl(MSR_IA32_PERF_CTL, val);
716} 723}
717 724
718static int knl_get_turbo_pstate(void) 725static int knl_get_turbo_pstate(void)
@@ -824,11 +831,11 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
824 * policy, or by cpu specific default values determined through 831 * policy, or by cpu specific default values determined through
825 * experimentation. 832 * experimentation.
826 */ 833 */
827 max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits->max_perf)); 834 max_perf_adj = fp_toint(max_perf * limits->max_perf);
828 *max = clamp_t(int, max_perf_adj, 835 *max = clamp_t(int, max_perf_adj,
829 cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); 836 cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
830 837
831 min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits->min_perf)); 838 min_perf = fp_toint(max_perf * limits->min_perf);
832 *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); 839 *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
833} 840}
834 841
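The dropped int_tofp()/mul_fp() pair above is not a behaviour change: limits->max_perf and limits->min_perf are already fixed-point fractions, so multiplying the integer max_perf by them leaves exactly one FRAC_BITS scale factor for fp_toint() to strip. A standalone check, assuming the driver's usual 8-bit fixed-point helpers defined as below:

#include <stdio.h>
#include <stdint.h>

#define FRAC_BITS 8
#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
#define fp_toint(X) ((X) >> FRAC_BITS)

static int64_t mul_fp(int64_t x, int64_t y)
{
	return (x * y) >> FRAC_BITS;
}

int main(void)
{
	int max_perf = 32;				/* an integer P-state */
	int64_t max_perf_frac = int_tofp(75) / 100;	/* 75% as a fixed-point fraction */

	/* Old expression: lift to fixed point, multiply, convert back. */
	int64_t old_way = fp_toint(mul_fp(int_tofp(max_perf), max_perf_frac));
	/* New expression: one multiplication, one final shift. */
	int64_t new_way = fp_toint(max_perf * max_perf_frac);

	printf("old = %lld, new = %lld\n", (long long)old_way, (long long)new_way);	/* both 24 */
	return 0;
}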
@@ -874,16 +881,10 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu)
874 core_pct = int_tofp(sample->aperf) * int_tofp(100); 881 core_pct = int_tofp(sample->aperf) * int_tofp(100);
875 core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); 882 core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
876 883
877 sample->freq = fp_toint(
878 mul_fp(int_tofp(
879 cpu->pstate.max_pstate_physical *
880 cpu->pstate.scaling / 100),
881 core_pct));
882
883 sample->core_pct_busy = (int32_t)core_pct; 884 sample->core_pct_busy = (int32_t)core_pct;
884} 885}
885 886
886static inline void intel_pstate_sample(struct cpudata *cpu) 887static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
887{ 888{
888 u64 aperf, mperf; 889 u64 aperf, mperf;
889 unsigned long flags; 890 unsigned long flags;
@@ -893,14 +894,14 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
893 rdmsrl(MSR_IA32_APERF, aperf); 894 rdmsrl(MSR_IA32_APERF, aperf);
894 rdmsrl(MSR_IA32_MPERF, mperf); 895 rdmsrl(MSR_IA32_MPERF, mperf);
895 tsc = rdtsc(); 896 tsc = rdtsc();
896 if ((cpu->prev_mperf == mperf) || (cpu->prev_tsc == tsc)) { 897 if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
897 local_irq_restore(flags); 898 local_irq_restore(flags);
898 return; 899 return false;
899 } 900 }
900 local_irq_restore(flags); 901 local_irq_restore(flags);
901 902
902 cpu->last_sample_time = cpu->sample.time; 903 cpu->last_sample_time = cpu->sample.time;
903 cpu->sample.time = ktime_get(); 904 cpu->sample.time = time;
904 cpu->sample.aperf = aperf; 905 cpu->sample.aperf = aperf;
905 cpu->sample.mperf = mperf; 906 cpu->sample.mperf = mperf;
906 cpu->sample.tsc = tsc; 907 cpu->sample.tsc = tsc;
@@ -908,27 +909,16 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
908 cpu->sample.mperf -= cpu->prev_mperf; 909 cpu->sample.mperf -= cpu->prev_mperf;
909 cpu->sample.tsc -= cpu->prev_tsc; 910 cpu->sample.tsc -= cpu->prev_tsc;
910 911
911 intel_pstate_calc_busy(cpu);
912
913 cpu->prev_aperf = aperf; 912 cpu->prev_aperf = aperf;
914 cpu->prev_mperf = mperf; 913 cpu->prev_mperf = mperf;
915 cpu->prev_tsc = tsc; 914 cpu->prev_tsc = tsc;
915 return true;
916} 916}
917 917
918static inline void intel_hwp_set_sample_time(struct cpudata *cpu) 918static inline int32_t get_avg_frequency(struct cpudata *cpu)
919{
920 int delay;
921
922 delay = msecs_to_jiffies(50);
923 mod_timer_pinned(&cpu->timer, jiffies + delay);
924}
925
926static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
927{ 919{
928 int delay; 920 return div64_u64(cpu->pstate.max_pstate_physical * cpu->sample.aperf *
929 921 cpu->pstate.scaling, cpu->sample.mperf);
930 delay = msecs_to_jiffies(pid_params.sample_rate_ms);
931 mod_timer_pinned(&cpu->timer, jiffies + delay);
932} 922}
933 923
934static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) 924static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
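get_avg_frequency(), added above in place of the cached sample->freq value, derives the average running frequency from the APERF/MPERF ratio scaled by the maximum physical P-state. A standalone illustration with hypothetical counter deltas (plain C, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical sample: max physical P-state 30, 100 MHz per P-state,
	 * APERF/MPERF deltas showing the core ran at 80% of the max clock. */
	uint64_t max_pstate_physical = 30, scaling = 100000;	/* kHz per P-state */
	uint64_t aperf = 80000, mperf = 100000;

	/* Same formula as get_avg_frequency() above. */
	uint64_t freq = max_pstate_physical * aperf * scaling / mperf;

	printf("average frequency = %llu kHz\n", (unsigned long long)freq);	/* 2400000 */
	return 0;
}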
@@ -954,7 +944,6 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
954 mperf = cpu->sample.mperf + delta_iowait_mperf; 944 mperf = cpu->sample.mperf + delta_iowait_mperf;
955 cpu->prev_cummulative_iowait = cummulative_iowait; 945 cpu->prev_cummulative_iowait = cummulative_iowait;
956 946
957
958 /* 947 /*
959 * The load can be estimated as the ratio of the mperf counter 948 * The load can be estimated as the ratio of the mperf counter
960 * running at a constant frequency during active periods 949 * running at a constant frequency during active periods
@@ -970,8 +959,9 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
970static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) 959static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
971{ 960{
972 int32_t core_busy, max_pstate, current_pstate, sample_ratio; 961 int32_t core_busy, max_pstate, current_pstate, sample_ratio;
973 s64 duration_us; 962 u64 duration_ns;
974 u32 sample_time; 963
964 intel_pstate_calc_busy(cpu);
975 965
976 /* 966 /*
977 * core_busy is the ratio of actual performance to max 967 * core_busy is the ratio of actual performance to max
@@ -990,18 +980,16 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
990 core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); 980 core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
991 981
992 /* 982 /*
993 * Since we have a deferred timer, it will not fire unless 983 * Since our utilization update callback will not run unless we are
994 * we are in C0. So, determine if the actual elapsed time 984 * in C0, check if the actual elapsed time is significantly greater (3x)
995 * is significantly greater (3x) than our sample interval. If it 985 * than our sample interval. If it is, then we were idle for a long
996 * is, then we were idle for a long enough period of time 986 * enough period of time to adjust our busyness.
997 * to adjust our busyness.
998 */ 987 */
999 sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC; 988 duration_ns = cpu->sample.time - cpu->last_sample_time;
1000 duration_us = ktime_us_delta(cpu->sample.time, 989 if ((s64)duration_ns > pid_params.sample_rate_ns * 3
1001 cpu->last_sample_time); 990 && cpu->last_sample_time > 0) {
1002 if (duration_us > sample_time * 3) { 991 sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns),
1003 sample_ratio = div_fp(int_tofp(sample_time), 992 int_tofp(duration_ns));
1004 int_tofp(duration_us));
1005 core_busy = mul_fp(core_busy, sample_ratio); 993 core_busy = mul_fp(core_busy, sample_ratio);
1006 } 994 }
1007 995
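The timestamp bookkeeping above keeps the original idle heuristic: when more than three sample periods have passed since the last sample, core_busy is scaled down by sample period / elapsed time. A standalone illustration of that scaling, using floating point instead of the driver's fixed-point helpers for readability:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t sample_rate_ns = 10 * 1000 * 1000;	/* 10 ms sample period */
	int64_t duration_ns = 50 * 1000 * 1000;		/* woke up after 50 ms */
	double core_busy = 90.0;			/* percent, before correction */

	/* Mirrors the check in get_target_pstate_use_performance() above. */
	if (duration_ns > sample_rate_ns * 3) {
		double sample_ratio = (double)sample_rate_ns / (double)duration_ns;
		core_busy *= sample_ratio;		/* 90 * 0.2 = 18 */
	}

	printf("core_busy after idle correction: %.1f%%\n", core_busy);
	return 0;
}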
@@ -1028,26 +1016,21 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
1028 sample->mperf, 1016 sample->mperf,
1029 sample->aperf, 1017 sample->aperf,
1030 sample->tsc, 1018 sample->tsc,
1031 sample->freq); 1019 get_avg_frequency(cpu));
1032} 1020}
1033 1021
1034static void intel_hwp_timer_func(unsigned long __data) 1022static void intel_pstate_update_util(struct update_util_data *data, u64 time,
1023 unsigned long util, unsigned long max)
1035{ 1024{
1036 struct cpudata *cpu = (struct cpudata *) __data; 1025 struct cpudata *cpu = container_of(data, struct cpudata, update_util);
1026 u64 delta_ns = time - cpu->sample.time;
1037 1027
1038 intel_pstate_sample(cpu); 1028 if ((s64)delta_ns >= pid_params.sample_rate_ns) {
1039 intel_hwp_set_sample_time(cpu); 1029 bool sample_taken = intel_pstate_sample(cpu, time);
1040}
1041 1030
1042static void intel_pstate_timer_func(unsigned long __data) 1031 if (sample_taken && !hwp_active)
1043{ 1032 intel_pstate_adjust_busy_pstate(cpu);
1044 struct cpudata *cpu = (struct cpudata *) __data; 1033 }
1045
1046 intel_pstate_sample(cpu);
1047
1048 intel_pstate_adjust_busy_pstate(cpu);
1049
1050 intel_pstate_set_sample_time(cpu);
1051} 1034}
1052 1035
1053#define ICPU(model, policy) \ 1036#define ICPU(model, policy) \
@@ -1095,24 +1078,19 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
1095 1078
1096 cpu->cpu = cpunum; 1079 cpu->cpu = cpunum;
1097 1080
1098 if (hwp_active) 1081 if (hwp_active) {
1099 intel_pstate_hwp_enable(cpu); 1082 intel_pstate_hwp_enable(cpu);
1083 pid_params.sample_rate_ms = 50;
1084 pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC;
1085 }
1100 1086
1101 intel_pstate_get_cpu_pstates(cpu); 1087 intel_pstate_get_cpu_pstates(cpu);
1102 1088
1103 init_timer_deferrable(&cpu->timer);
1104 cpu->timer.data = (unsigned long)cpu;
1105 cpu->timer.expires = jiffies + HZ/100;
1106
1107 if (!hwp_active)
1108 cpu->timer.function = intel_pstate_timer_func;
1109 else
1110 cpu->timer.function = intel_hwp_timer_func;
1111
1112 intel_pstate_busy_pid_reset(cpu); 1089 intel_pstate_busy_pid_reset(cpu);
1113 intel_pstate_sample(cpu); 1090 intel_pstate_sample(cpu, 0);
1114 1091
1115 add_timer_on(&cpu->timer, cpunum); 1092 cpu->update_util.func = intel_pstate_update_util;
1093 cpufreq_set_update_util_data(cpunum, &cpu->update_util);
1116 1094
1117 pr_debug("intel_pstate: controlling: cpu %d\n", cpunum); 1095 pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
1118 1096
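The registration added above is the core of the timer removal: each CPU gets a struct update_util_data whose ->func the scheduler invokes on utilization updates, and the callback rate-limits itself against sample_rate_ns instead of re-arming a deferrable timer. A condensed sketch of that pattern for a hypothetical driver-private structure (field and function names made up); teardown pairs the NULL registration with synchronize_sched(), as in intel_pstate_stop_cpu() below:

#include <linux/cpufreq.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>

/* Hypothetical per-CPU state wrapping the scheduler hook. */
struct example_cpu {
	struct update_util_data update_util;
	u64 last_sample_time;
	s64 sample_rate_ns;
};

static void example_update_util(struct update_util_data *data, u64 time,
				unsigned long util, unsigned long max)
{
	struct example_cpu *ec = container_of(data, struct example_cpu, update_util);

	/* Invoked from the scheduler on every utilization update;
	 * only act once per sample period, as intel_pstate_update_util() does. */
	if ((s64)(time - ec->last_sample_time) < ec->sample_rate_ns)
		return;

	ec->last_sample_time = time;
	/* ... take a sample and adjust the P-state here ... */
}

static void example_register(struct example_cpu *ec, int cpu)
{
	ec->update_util.func = example_update_util;
	cpufreq_set_update_util_data(cpu, &ec->update_util);
}

static void example_unregister(int cpu)
{
	/* Clear the per-CPU pointer, then wait for in-flight callbacks. */
	cpufreq_set_update_util_data(cpu, NULL);
	synchronize_sched();
}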
@@ -1128,7 +1106,7 @@ static unsigned int intel_pstate_get(unsigned int cpu_num)
1128 if (!cpu) 1106 if (!cpu)
1129 return 0; 1107 return 0;
1130 sample = &cpu->sample; 1108 sample = &cpu->sample;
1131 return sample->freq; 1109 return get_avg_frequency(cpu);
1132} 1110}
1133 1111
1134static int intel_pstate_set_policy(struct cpufreq_policy *policy) 1112static int intel_pstate_set_policy(struct cpufreq_policy *policy)
@@ -1141,7 +1119,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
1141 pr_debug("intel_pstate: set performance\n"); 1119 pr_debug("intel_pstate: set performance\n");
1142 limits = &performance_limits; 1120 limits = &performance_limits;
1143 if (hwp_active) 1121 if (hwp_active)
1144 intel_pstate_hwp_set(); 1122 intel_pstate_hwp_set(policy->cpus);
1145 return 0; 1123 return 0;
1146 } 1124 }
1147 1125
@@ -1173,7 +1151,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
1173 int_tofp(100)); 1151 int_tofp(100));
1174 1152
1175 if (hwp_active) 1153 if (hwp_active)
1176 intel_pstate_hwp_set(); 1154 intel_pstate_hwp_set(policy->cpus);
1177 1155
1178 return 0; 1156 return 0;
1179} 1157}
@@ -1196,7 +1174,9 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
1196 1174
1197 pr_debug("intel_pstate: CPU %d exiting\n", cpu_num); 1175 pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
1198 1176
1199 del_timer_sync(&all_cpu_data[cpu_num]->timer); 1177 cpufreq_set_update_util_data(cpu_num, NULL);
1178 synchronize_sched();
1179
1200 if (hwp_active) 1180 if (hwp_active)
1201 return; 1181 return;
1202 1182
@@ -1260,6 +1240,7 @@ static int intel_pstate_msrs_not_valid(void)
1260static void copy_pid_params(struct pstate_adjust_policy *policy) 1240static void copy_pid_params(struct pstate_adjust_policy *policy)
1261{ 1241{
1262 pid_params.sample_rate_ms = policy->sample_rate_ms; 1242 pid_params.sample_rate_ms = policy->sample_rate_ms;
1243 pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
1263 pid_params.p_gain_pct = policy->p_gain_pct; 1244 pid_params.p_gain_pct = policy->p_gain_pct;
1264 pid_params.i_gain_pct = policy->i_gain_pct; 1245 pid_params.i_gain_pct = policy->i_gain_pct;
1265 pid_params.d_gain_pct = policy->d_gain_pct; 1246 pid_params.d_gain_pct = policy->d_gain_pct;
@@ -1397,6 +1378,11 @@ static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
1397static inline bool intel_pstate_has_acpi_ppc(void) { return false; } 1378static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
1398#endif /* CONFIG_ACPI */ 1379#endif /* CONFIG_ACPI */
1399 1380
1381static const struct x86_cpu_id hwp_support_ids[] __initconst = {
1382 { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
1383 {}
1384};
1385
1400static int __init intel_pstate_init(void) 1386static int __init intel_pstate_init(void)
1401{ 1387{
1402 int cpu, rc = 0; 1388 int cpu, rc = 0;
@@ -1406,17 +1392,16 @@ static int __init intel_pstate_init(void)
1406 if (no_load) 1392 if (no_load)
1407 return -ENODEV; 1393 return -ENODEV;
1408 1394
1395 if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
1396 copy_cpu_funcs(&core_params.funcs);
1397 hwp_active++;
1398 goto hwp_cpu_matched;
1399 }
1400
1409 id = x86_match_cpu(intel_pstate_cpu_ids); 1401 id = x86_match_cpu(intel_pstate_cpu_ids);
1410 if (!id) 1402 if (!id)
1411 return -ENODEV; 1403 return -ENODEV;
1412 1404
1413 /*
1414 * The Intel pstate driver will be ignored if the platform
1415 * firmware has its own power management modes.
1416 */
1417 if (intel_pstate_platform_pwr_mgmt_exists())
1418 return -ENODEV;
1419
1420 cpu_def = (struct cpu_defaults *)id->driver_data; 1405 cpu_def = (struct cpu_defaults *)id->driver_data;
1421 1406
1422 copy_pid_params(&cpu_def->pid_policy); 1407 copy_pid_params(&cpu_def->pid_policy);
@@ -1425,17 +1410,20 @@ static int __init intel_pstate_init(void)
1425 if (intel_pstate_msrs_not_valid()) 1410 if (intel_pstate_msrs_not_valid())
1426 return -ENODEV; 1411 return -ENODEV;
1427 1412
1413hwp_cpu_matched:
1414 /*
1415 * The Intel pstate driver will be ignored if the platform
1416 * firmware has its own power management modes.
1417 */
1418 if (intel_pstate_platform_pwr_mgmt_exists())
1419 return -ENODEV;
1420
1428 pr_info("Intel P-state driver initializing.\n"); 1421 pr_info("Intel P-state driver initializing.\n");
1429 1422
1430 all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus()); 1423 all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
1431 if (!all_cpu_data) 1424 if (!all_cpu_data)
1432 return -ENOMEM; 1425 return -ENOMEM;
1433 1426
1434 if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) {
1435 pr_info("intel_pstate: HWP enabled\n");
1436 hwp_active++;
1437 }
1438
1439 if (!hwp_active && hwp_only) 1427 if (!hwp_active && hwp_only)
1440 goto out; 1428 goto out;
1441 1429
@@ -1446,12 +1434,16 @@ static int __init intel_pstate_init(void)
1446 intel_pstate_debug_expose_params(); 1434 intel_pstate_debug_expose_params();
1447 intel_pstate_sysfs_expose_params(); 1435 intel_pstate_sysfs_expose_params();
1448 1436
1437 if (hwp_active)
1438 pr_info("intel_pstate: HWP enabled\n");
1439
1449 return rc; 1440 return rc;
1450out: 1441out:
1451 get_online_cpus(); 1442 get_online_cpus();
1452 for_each_online_cpu(cpu) { 1443 for_each_online_cpu(cpu) {
1453 if (all_cpu_data[cpu]) { 1444 if (all_cpu_data[cpu]) {
1454 del_timer_sync(&all_cpu_data[cpu]->timer); 1445 cpufreq_set_update_util_data(cpu, NULL);
1446 synchronize_sched();
1455 kfree(all_cpu_data[cpu]); 1447 kfree(all_cpu_data[cpu]);
1456 } 1448 }
1457 } 1449 }
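The sample-ratio hunk near the top of this file's diff discounts core_busy when the CPU sat idle for much longer than one sample period: once duration_ns exceeds three sample periods, core_busy is multiplied by sample_rate_ns / duration_ns in the driver's fixed-point format. A worked example in plain userspace C, with made-up numbers instead of the driver's fixed-point helpers:

        /* Worked example only: plain integers, not the driver's fixed point. */
        #include <stdio.h>

        int main(void)
        {
                unsigned long long sample_rate_ns = 10ULL * 1000 * 1000;  /* 10 ms sample period */
                unsigned long long duration_ns    = 40ULL * 1000 * 1000;  /* 40 ms since last sample */
                unsigned long long core_busy      = 80;                   /* percent busy */

                /* Mirror of the condition and scaling in the patch. */
                if (duration_ns > 3 * sample_rate_ns)
                        core_busy = core_busy * sample_rate_ns / duration_ns;

                printf("scaled core_busy: %llu%%\n", core_busy);          /* prints 20% */
                return 0;
        }

So a CPU that looked 80% busy during its last active window is treated as roughly 20% busy after four idle sample periods, which is what lets the P-state drop quickly after an idle stretch.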
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 547890fd9572..50bf12033bbc 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -28,6 +28,8 @@
28#include <linux/of.h> 28#include <linux/of.h>
29#include <linux/reboot.h> 29#include <linux/reboot.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/cpu.h>
32#include <trace/events/power.h>
31 33
32#include <asm/cputhreads.h> 34#include <asm/cputhreads.h>
33#include <asm/firmware.h> 35#include <asm/firmware.h>
@@ -42,13 +44,24 @@
42 44
43static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; 45static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
44static bool rebooting, throttled, occ_reset; 46static bool rebooting, throttled, occ_reset;
47static unsigned int *core_to_chip_map;
48
49static const char * const throttle_reason[] = {
50 "No throttling",
51 "Power Cap",
52 "Processor Over Temperature",
53 "Power Supply Failure",
54 "Over Current",
55 "OCC Reset"
56};
45 57
46static struct chip { 58static struct chip {
47 unsigned int id; 59 unsigned int id;
48 bool throttled; 60 bool throttled;
61 bool restore;
62 u8 throttle_reason;
49 cpumask_t mask; 63 cpumask_t mask;
50 struct work_struct throttle; 64 struct work_struct throttle;
51 bool restore;
52} *chips; 65} *chips;
53 66
54static int nr_chips; 67static int nr_chips;
@@ -312,13 +325,14 @@ static inline unsigned int get_nominal_index(void)
312static void powernv_cpufreq_throttle_check(void *data) 325static void powernv_cpufreq_throttle_check(void *data)
313{ 326{
314 unsigned int cpu = smp_processor_id(); 327 unsigned int cpu = smp_processor_id();
328 unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)];
315 unsigned long pmsr; 329 unsigned long pmsr;
316 int pmsr_pmax, i; 330 int pmsr_pmax, i;
317 331
318 pmsr = get_pmspr(SPRN_PMSR); 332 pmsr = get_pmspr(SPRN_PMSR);
319 333
320 for (i = 0; i < nr_chips; i++) 334 for (i = 0; i < nr_chips; i++)
321 if (chips[i].id == cpu_to_chip_id(cpu)) 335 if (chips[i].id == chip_id)
322 break; 336 break;
323 337
324 /* Check for Pmax Capping */ 338 /* Check for Pmax Capping */
@@ -328,17 +342,17 @@ static void powernv_cpufreq_throttle_check(void *data)
328 goto next; 342 goto next;
329 chips[i].throttled = true; 343 chips[i].throttled = true;
330 if (pmsr_pmax < powernv_pstate_info.nominal) 344 if (pmsr_pmax < powernv_pstate_info.nominal)
331 pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", 345 pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
332 cpu, chips[i].id, pmsr_pmax, 346 cpu, chips[i].id, pmsr_pmax,
333 powernv_pstate_info.nominal); 347 powernv_pstate_info.nominal);
334 else 348 trace_powernv_throttle(chips[i].id,
335 pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", 349 throttle_reason[chips[i].throttle_reason],
336 cpu, chips[i].id, pmsr_pmax, 350 pmsr_pmax);
337 powernv_pstate_info.max);
338 } else if (chips[i].throttled) { 351 } else if (chips[i].throttled) {
339 chips[i].throttled = false; 352 chips[i].throttled = false;
340 pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, 353 trace_powernv_throttle(chips[i].id,
341 chips[i].id, pmsr_pmax); 354 throttle_reason[chips[i].throttle_reason],
355 pmsr_pmax);
342 } 356 }
343 357
344 /* Check if Psafe_mode_active is set in PMSR. */ 358 /* Check if Psafe_mode_active is set in PMSR. */
@@ -356,7 +370,7 @@ next:
356 370
357 if (throttled) { 371 if (throttled) {
358 pr_info("PMSR = %16lx\n", pmsr); 372 pr_info("PMSR = %16lx\n", pmsr);
359 pr_crit("CPU Frequency could be throttled\n"); 373 pr_warn("CPU Frequency could be throttled\n");
360 } 374 }
361} 375}
362 376
@@ -423,18 +437,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work)
423{ 437{
424 struct chip *chip = container_of(work, struct chip, throttle); 438 struct chip *chip = container_of(work, struct chip, throttle);
425 unsigned int cpu; 439 unsigned int cpu;
426 cpumask_var_t mask; 440 cpumask_t mask;
427 441
428 smp_call_function_any(&chip->mask, 442 get_online_cpus();
443 cpumask_and(&mask, &chip->mask, cpu_online_mask);
444 smp_call_function_any(&mask,
429 powernv_cpufreq_throttle_check, NULL, 0); 445 powernv_cpufreq_throttle_check, NULL, 0);
430 446
431 if (!chip->restore) 447 if (!chip->restore)
432 return; 448 goto out;
433 449
434 chip->restore = false; 450 chip->restore = false;
435 cpumask_copy(mask, &chip->mask); 451 for_each_cpu(cpu, &mask) {
436 for_each_cpu_and(cpu, mask, cpu_online_mask) { 452 int index;
437 int index, tcpu;
438 struct cpufreq_policy policy; 453 struct cpufreq_policy policy;
439 454
440 cpufreq_get_policy(&policy, cpu); 455 cpufreq_get_policy(&policy, cpu);
@@ -442,20 +457,12 @@ void powernv_cpufreq_work_fn(struct work_struct *work)
442 policy.cur, 457 policy.cur,
443 CPUFREQ_RELATION_C, &index); 458 CPUFREQ_RELATION_C, &index);
444 powernv_cpufreq_target_index(&policy, index); 459 powernv_cpufreq_target_index(&policy, index);
445 for_each_cpu(tcpu, policy.cpus) 460 cpumask_andnot(&mask, &mask, policy.cpus);
446 cpumask_clear_cpu(tcpu, mask);
447 } 461 }
462out:
463 put_online_cpus();
448} 464}
449 465
450static char throttle_reason[][30] = {
451 "No throttling",
452 "Power Cap",
453 "Processor Over Temperature",
454 "Power Supply Failure",
455 "Over Current",
456 "OCC Reset"
457 };
458
459static int powernv_cpufreq_occ_msg(struct notifier_block *nb, 466static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
460 unsigned long msg_type, void *_msg) 467 unsigned long msg_type, void *_msg)
461{ 468{
@@ -481,7 +488,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
481 */ 488 */
482 if (!throttled) { 489 if (!throttled) {
483 throttled = true; 490 throttled = true;
484 pr_crit("CPU frequency is throttled for duration\n"); 491 pr_warn("CPU frequency is throttled for duration\n");
485 } 492 }
486 493
487 break; 494 break;
@@ -505,23 +512,18 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
505 return 0; 512 return 0;
506 } 513 }
507 514
508 if (omsg.throttle_status && 515 for (i = 0; i < nr_chips; i++)
516 if (chips[i].id == omsg.chip)
517 break;
518
519 if (omsg.throttle_status >= 0 &&
509 omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) 520 omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS)
510 pr_info("OCC: Chip %u Pmax reduced due to %s\n", 521 chips[i].throttle_reason = omsg.throttle_status;
511 (unsigned int)omsg.chip,
512 throttle_reason[omsg.throttle_status]);
513 else if (!omsg.throttle_status)
514 pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip,
515 throttle_reason[omsg.throttle_status]);
516 else
517 return 0;
518 522
519 for (i = 0; i < nr_chips; i++) 523 if (!omsg.throttle_status)
520 if (chips[i].id == omsg.chip) { 524 chips[i].restore = true;
521 if (!omsg.throttle_status) 525
522 chips[i].restore = true; 526 schedule_work(&chips[i].throttle);
523 schedule_work(&chips[i].throttle);
524 }
525 } 527 }
526 return 0; 528 return 0;
527} 529}
@@ -556,29 +558,54 @@ static int init_chip_info(void)
556 unsigned int chip[256]; 558 unsigned int chip[256];
557 unsigned int cpu, i; 559 unsigned int cpu, i;
558 unsigned int prev_chip_id = UINT_MAX; 560 unsigned int prev_chip_id = UINT_MAX;
561 cpumask_t cpu_mask;
562 int ret = -ENOMEM;
563
564 core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int),
565 GFP_KERNEL);
566 if (!core_to_chip_map)
567 goto out;
559 568
560 for_each_possible_cpu(cpu) { 569 cpumask_copy(&cpu_mask, cpu_possible_mask);
570 for_each_cpu(cpu, &cpu_mask) {
561 unsigned int id = cpu_to_chip_id(cpu); 571 unsigned int id = cpu_to_chip_id(cpu);
562 572
563 if (prev_chip_id != id) { 573 if (prev_chip_id != id) {
564 prev_chip_id = id; 574 prev_chip_id = id;
565 chip[nr_chips++] = id; 575 chip[nr_chips++] = id;
566 } 576 }
577 core_to_chip_map[cpu_core_index_of_thread(cpu)] = id;
578 cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu));
567 } 579 }
568 580
569 chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); 581 chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
570 if (!chips) 582 if (!chips)
571 return -ENOMEM; 583 goto free_chip_map;
572 584
573 for (i = 0; i < nr_chips; i++) { 585 for (i = 0; i < nr_chips; i++) {
574 chips[i].id = chip[i]; 586 chips[i].id = chip[i];
575 chips[i].throttled = false;
576 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); 587 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
577 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); 588 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
578 chips[i].restore = false;
579 } 589 }
580 590
581 return 0; 591 return 0;
592free_chip_map:
593 kfree(core_to_chip_map);
594out:
595 return ret;
596}
597
598static inline void clean_chip_info(void)
599{
600 kfree(chips);
601 kfree(core_to_chip_map);
602}
603
604static inline void unregister_all_notifiers(void)
605{
606 opal_message_notifier_unregister(OPAL_MSG_OCC,
607 &powernv_cpufreq_opal_nb);
608 unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
582} 609}
583 610
584static int __init powernv_cpufreq_init(void) 611static int __init powernv_cpufreq_init(void)
@@ -591,28 +618,35 @@ static int __init powernv_cpufreq_init(void)
591 618
592 /* Discover pstates from device tree and init */ 619 /* Discover pstates from device tree and init */
593 rc = init_powernv_pstates(); 620 rc = init_powernv_pstates();
594 if (rc) { 621 if (rc)
595 pr_info("powernv-cpufreq disabled. System does not support PState control\n"); 622 goto out;
596 return rc;
597 }
598 623
599 /* Populate chip info */ 624 /* Populate chip info */
600 rc = init_chip_info(); 625 rc = init_chip_info();
601 if (rc) 626 if (rc)
602 return rc; 627 goto out;
603 628
604 register_reboot_notifier(&powernv_cpufreq_reboot_nb); 629 register_reboot_notifier(&powernv_cpufreq_reboot_nb);
605 opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); 630 opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);
606 return cpufreq_register_driver(&powernv_cpufreq_driver); 631
632 rc = cpufreq_register_driver(&powernv_cpufreq_driver);
633 if (!rc)
634 return 0;
635
636 pr_info("Failed to register the cpufreq driver (%d)\n", rc);
637 unregister_all_notifiers();
638 clean_chip_info();
639out:
640 pr_info("Platform driver disabled. System does not support PState control\n");
641 return rc;
607} 642}
608module_init(powernv_cpufreq_init); 643module_init(powernv_cpufreq_init);
609 644
610static void __exit powernv_cpufreq_exit(void) 645static void __exit powernv_cpufreq_exit(void)
611{ 646{
612 unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
613 opal_message_notifier_unregister(OPAL_MSG_OCC,
614 &powernv_cpufreq_opal_nb);
615 cpufreq_unregister_driver(&powernv_cpufreq_driver); 647 cpufreq_unregister_driver(&powernv_cpufreq_driver);
648 unregister_all_notifiers();
649 clean_chip_info();
616} 650}
617module_exit(powernv_cpufreq_exit); 651module_exit(powernv_cpufreq_exit);
618 652
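The powernv changes move the human-readable throttle reporting onto the new powernv_throttle tracepoint and make the worker walk only the online CPUs of the affected chip, visiting each cpufreq policy once. A sketch of that one-pass-per-policy loop, with an illustrative helper name; the mask handling mirrors powernv_cpufreq_work_fn() above:

        #include <linux/cpu.h>
        #include <linux/cpufreq.h>
        #include <linux/cpumask.h>

        static void restore_chip_policies(const struct cpumask *chip_mask)
        {
                cpumask_t mask;
                unsigned int cpu;

                get_online_cpus();
                cpumask_and(&mask, chip_mask, cpu_online_mask);

                for_each_cpu(cpu, &mask) {
                        struct cpufreq_policy policy;

                        if (cpufreq_get_policy(&policy, cpu))
                                continue;
                        /* ... pick a frequency and call the driver's target hook ... */

                        /* Drop every CPU covered by this policy so it is handled once. */
                        cpumask_andnot(&mask, &mask, policy.cpus);
                }
                put_online_cpus();
        }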
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 88a4215125bc..718e8725de8a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -80,7 +80,6 @@ struct cpufreq_policy {
80 unsigned int last_policy; /* policy before unplug */ 80 unsigned int last_policy; /* policy before unplug */
81 struct cpufreq_governor *governor; /* see below */ 81 struct cpufreq_governor *governor; /* see below */
82 void *governor_data; 82 void *governor_data;
83 bool governor_enabled; /* governor start/stop flag */
84 char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */ 83 char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
85 84
86 struct work_struct update; /* if update_policy() needs to be 85 struct work_struct update; /* if update_policy() needs to be
@@ -100,10 +99,6 @@ struct cpufreq_policy {
100 * - Any routine that will write to the policy structure and/or may take away 99 * - Any routine that will write to the policy structure and/or may take away
101 * the policy altogether (eg. CPU hotplug), will hold this lock in write 100 * the policy altogether (eg. CPU hotplug), will hold this lock in write
102 * mode before doing so. 101 * mode before doing so.
103 *
104 * Additional rules:
105 * - Lock should not be held across
106 * __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
107 */ 102 */
108 struct rw_semaphore rwsem; 103 struct rw_semaphore rwsem;
109 104
@@ -464,29 +459,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
464int cpufreq_register_governor(struct cpufreq_governor *governor); 459int cpufreq_register_governor(struct cpufreq_governor *governor);
465void cpufreq_unregister_governor(struct cpufreq_governor *governor); 460void cpufreq_unregister_governor(struct cpufreq_governor *governor);
466 461
467/* CPUFREQ DEFAULT GOVERNOR */ 462struct cpufreq_governor *cpufreq_default_governor(void);
468/* 463struct cpufreq_governor *cpufreq_fallback_governor(void);
469 * Performance governor is fallback governor if any other gov failed to auto
470 * load due latency restrictions
471 */
472#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
473extern struct cpufreq_governor cpufreq_gov_performance;
474#endif
475#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
476#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_performance)
477#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE)
478extern struct cpufreq_governor cpufreq_gov_powersave;
479#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_powersave)
480#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE)
481extern struct cpufreq_governor cpufreq_gov_userspace;
482#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_userspace)
483#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND)
484extern struct cpufreq_governor cpufreq_gov_ondemand;
485#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_ondemand)
486#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
487extern struct cpufreq_governor cpufreq_gov_conservative;
488#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
489#endif
490 464
491/********************************************************************* 465/*********************************************************************
492 * FREQUENCY TABLE HELPERS * 466 * FREQUENCY TABLE HELPERS *
@@ -525,16 +499,6 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
525} 499}
526#endif 500#endif
527 501
528static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
529{
530 while ((*pos)->frequency != CPUFREQ_TABLE_END)
531 if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID)
532 return true;
533 else
534 (*pos)++;
535 return false;
536}
537
538/* 502/*
539 * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table 503 * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table
540 * @pos: the cpufreq_frequency_table * to use as a loop cursor. 504 * @pos: the cpufreq_frequency_table * to use as a loop cursor.
@@ -551,8 +515,11 @@ static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
551 * @table: the cpufreq_frequency_table * to iterate over. 515 * @table: the cpufreq_frequency_table * to iterate over.
552 */ 516 */
553 517
554#define cpufreq_for_each_valid_entry(pos, table) \ 518#define cpufreq_for_each_valid_entry(pos, table) \
555 for (pos = table; cpufreq_next_valid(&pos); pos++) 519 for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++) \
520 if (pos->frequency == CPUFREQ_ENTRY_INVALID) \
521 continue; \
522 else
556 523
557int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, 524int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
558 struct cpufreq_frequency_table *table); 525 struct cpufreq_frequency_table *table);
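The open-coded cpufreq_next_valid() helper is folded into the macro: the loop now ends in an if/else, and because the else is left dangling, the statement the caller writes after the macro becomes that else branch, so invalid entries are skipped by the continue without a helper call. Usage is unchanged; a small sketch (table contents and function name are made up):

        #include <linux/cpufreq.h>

        static unsigned int count_valid_entries(struct cpufreq_frequency_table *table)
        {
                struct cpufreq_frequency_table *pos;
                unsigned int n = 0;

                cpufreq_for_each_valid_entry(pos, table)
                        n++;    /* runs only for entries that are not CPUFREQ_ENTRY_INVALID */

                return n;
        }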
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a94cc3..913e755ef7b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit)
3207 return task_rlimit_max(current, limit); 3207 return task_rlimit_max(current, limit);
3208} 3208}
3209 3209
3210#ifdef CONFIG_CPU_FREQ
3211struct update_util_data {
3212 void (*func)(struct update_util_data *data,
3213 u64 time, unsigned long util, unsigned long max);
3214};
3215
3216void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
3217#endif /* CONFIG_CPU_FREQ */
3218
3210#endif 3219#endif
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 284244ebfe8d..19e50300ce7d 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle,
38 TP_ARGS(state, cpu_id) 38 TP_ARGS(state, cpu_id)
39); 39);
40 40
41TRACE_EVENT(powernv_throttle,
42
43 TP_PROTO(int chip_id, const char *reason, int pmax),
44
45 TP_ARGS(chip_id, reason, pmax),
46
47 TP_STRUCT__entry(
48 __field(int, chip_id)
49 __string(reason, reason)
50 __field(int, pmax)
51 ),
52
53 TP_fast_assign(
54 __entry->chip_id = chip_id;
55 __assign_str(reason, reason);
56 __entry->pmax = pmax;
57 ),
58
59 TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
60 __entry->pmax, __get_str(reason))
61);
62
41TRACE_EVENT(pstate_sample, 63TRACE_EVENT(pstate_sample,
42 64
43 TP_PROTO(u32 core_busy, 65 TP_PROTO(u32 core_busy,
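The new powernv_throttle event copies the reason string into the ring buffer (__string/__assign_str), so callers can pass a pointer into the static throttle_reason[] table without lifetime worries. Emitting it is a plain tracepoint call; a trivial sketch with made-up values:

        #include <trace/events/power.h>

        static void report_throttle_example(void)
        {
                /* Chip id, reason and Pmax value are illustrative. */
                trace_powernv_throttle(0, "Power Cap", 80);
        }

With the event enabled, each hit is rendered per the TP_printk above, e.g. "Chip 0 Pmax 80 Power Cap"; the EXPORT_TRACEPOINT_SYMBOL_GPL() hunk at the end of this patch is what allows a modular driver to make this call.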
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..9507522164ac 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
20obj-$(CONFIG_SCHED_DEBUG) += debug.o 20obj-$(CONFIG_SCHED_DEBUG) += debug.o
21obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o 21obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
22obj-$(CONFIG_CPU_FREQ) += cpufreq.o
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..928c4ba32f68
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,37 @@
1/*
2 * Scheduler code and data structures related to cpufreq.
3 *
4 * Copyright (C) 2016, Intel Corporation
5 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include "sched.h"
13
14DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
15
16/**
17 * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
18 * @cpu: The CPU to set the pointer for.
19 * @data: New pointer value.
20 *
21 * Set and publish the update_util_data pointer for the given CPU. That pointer
22 * points to a struct update_util_data object containing a callback function
23 * to call from cpufreq_update_util(). That function will be called from an RCU
24 * read-side critical section, so it must not sleep.
25 *
26 * Callers must use RCU-sched callbacks to free any memory that might be
27 * accessed via the old update_util_data pointer or invoke synchronize_sched()
28 * right after this function to avoid use-after-free.
29 */
30void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
31{
32 if (WARN_ON(data && !data->func))
33 return;
34
35 rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
36}
37EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
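The kernel-doc above spells out the lifetime rule: the callback runs under rcu_read_lock_sched(), so whoever clears the pointer must wait for a grace period before freeing anything the callback might still touch. A teardown sketch following that rule (the function and state names are illustrative; intel_pstate_stop_cpu() and the intel_pstate_init() error path in this patch do the same thing):

        #include <linux/rcupdate.h>
        #include <linux/sched.h>
        #include <linux/slab.h>

        static void my_stop_cpu(int cpu, void *per_cpu_state)
        {
                /* Unpublish the hook; new scheduler updates will see NULL. */
                cpufreq_set_update_util_data(cpu, NULL);

                /* Wait for callbacks that already started under rcu_read_lock_sched(). */
                synchronize_sched();

                /* Only now is it safe to free what the callback could reference. */
                kfree(per_cpu_state);
        }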
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 57b939c81bce..2037cf432a45 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq)
726 if (!dl_task(curr) || !on_dl_rq(dl_se)) 726 if (!dl_task(curr) || !on_dl_rq(dl_se))
727 return; 727 return;
728 728
729 /* Kick cpufreq (see the comment in linux/cpufreq.h). */
730 if (cpu_of(rq) == smp_processor_id())
731 cpufreq_trigger_update(rq_clock(rq));
732
729 /* 733 /*
730 * Consumed budget is computed considering the time as 734 * Consumed budget is computed considering the time as
731 * observed by schedulable tasks (excluding time spent 735 * observed by schedulable tasks (excluding time spent
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b83947..e2987a7e489d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
2824{ 2824{
2825 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2825 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2826 u64 now = cfs_rq_clock_task(cfs_rq); 2826 u64 now = cfs_rq_clock_task(cfs_rq);
2827 int cpu = cpu_of(rq_of(cfs_rq)); 2827 struct rq *rq = rq_of(cfs_rq);
2828 int cpu = cpu_of(rq);
2828 2829
2829 /* 2830 /*
2830 * Track task load average for carrying it to new CPU after migrated, and 2831 * Track task load average for carrying it to new CPU after migrated, and
@@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
2836 2837
2837 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) 2838 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2838 update_tg_load_avg(cfs_rq, 0); 2839 update_tg_load_avg(cfs_rq, 0);
2840
2841 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2842 unsigned long max = rq->cpu_capacity_orig;
2843
2844 /*
2845 * There are a few boundary cases this might miss but it should
2846 * get called often enough that that should (hopefully) not be
2847 * a real problem -- added to that it only calls on the local
2848 * CPU, so if we enqueue remotely we'll miss an update, but
2849 * the next tick/schedule should update.
2850 *
2851 * It will not get called when we go idle, because the idle
2852 * thread is a different class (!fair), nor will the utilization
2853 * number include things like RT tasks.
2854 *
2855 * As is, the util number is not freq-invariant (we'd have to
2856 * implement arch_scale_freq_capacity() for that).
2857 *
2858 * See cpu_util().
2859 */
2860 cpufreq_update_util(rq_clock(rq),
2861 min(cfs_rq->avg.util_avg, max), max);
2862 }
2839} 2863}
2840 2864
2841static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2865static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
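update_load_avg() now hands the hook the CFS utilization of the local runqueue together with its capacity ceiling, leaving the frequency policy entirely to the callback (intel_pstate, for instance, ignores util/max and keeps its own sampling). Purely as an illustration of what a consumer could do with the pair, not something this patch implements:

        #include <linux/types.h>

        static unsigned int util_to_freq_khz(unsigned long util, unsigned long max,
                                             unsigned int max_freq_khz)
        {
                /* cpufreq_trigger_update() passes util == ULONG_MAX, max == 0:
                 * treat that as "go to the top" rather than dividing by zero. */
                if (!max || util >= max)
                        return max_freq_khz;

                return (unsigned int)((u64)max_freq_khz * util / max);
        }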
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..27f5b03cbdbe 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq)
945 if (curr->sched_class != &rt_sched_class) 945 if (curr->sched_class != &rt_sched_class)
946 return; 946 return;
947 947
948 /* Kick cpufreq (see the comment in linux/cpufreq.h). */
949 if (cpu_of(rq) == smp_processor_id())
950 cpufreq_trigger_update(rq_clock(rq));
951
948 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 952 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
949 if (unlikely((s64)delta_exec <= 0)) 953 if (unlikely((s64)delta_exec <= 0))
950 return; 954 return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10f16374df7f..faf7e2758dd0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1738,3 +1738,51 @@ static inline u64 irq_time_read(int cpu)
1738} 1738}
1739#endif /* CONFIG_64BIT */ 1739#endif /* CONFIG_64BIT */
1740#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1740#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1741
1742#ifdef CONFIG_CPU_FREQ
1743DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
1744
1745/**
1746 * cpufreq_update_util - Take a note about CPU utilization changes.
1747 * @time: Current time.
1748 * @util: Current utilization.
1749 * @max: Utilization ceiling.
1750 *
1751 * This function is called by the scheduler on every invocation of
1752 * update_load_avg() on the CPU whose utilization is being updated.
1753 *
1754 * It can only be called from RCU-sched read-side critical sections.
1755 */
1756static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
1757{
1758 struct update_util_data *data;
1759
1760 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1761 if (data)
1762 data->func(data, time, util, max);
1763}
1764
1765/**
1766 * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
1767 * @time: Current time.
1768 *
1769 * The way cpufreq is currently arranged requires it to evaluate the CPU
1770 * performance state (frequency/voltage) on a regular basis to prevent it from
1771 * being stuck in a completely inadequate performance level for too long.
1772 * That is not guaranteed to happen if the updates are only triggered from CFS,
1773 * though, because they may not be coming in if RT or deadline tasks are active
1774 * all the time (or there are RT and DL tasks only).
1775 *
1776 * As a workaround for that issue, this function is called by the RT and DL
1777 * sched classes to trigger extra cpufreq updates to prevent it from stalling,
1778 * but that really is a band-aid. Going forward it should be replaced with
1779 * solutions targeted more specifically at RT and DL tasks.
1780 */
1781static inline void cpufreq_trigger_update(u64 time)
1782{
1783 cpufreq_update_util(time, ULONG_MAX, 0);
1784}
1785#else
1786static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
1787static inline void cpufreq_trigger_update(u64 time) {}
1788#endif /* CONFIG_CPU_FREQ */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index eb4220a132ec..81b87451c0ea 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,4 +15,5 @@
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); 16EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
17EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 17EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
18EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
18 19