author		Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>	2006-10-03 15:38:45 -0400
committer	Dave Jones <davej@redhat.com>	2006-10-15 19:57:11 -0400
commit		dfde5d62ed9b28b0bda676c16e8cb635df244ef2 (patch)
tree		12c690189fcc7155389860beae554199456b7d3e
parent		a6f6e6e6ab464c9d1dff66570b78be2f66d8ba3d (diff)
[CPUFREQ][8/8] acpi-cpufreq: Add support for freq feedback from hardware
Enable the ondemand governor and acpi-cpufreq to use the IA32_APERF and
IA32_MPERF MSRs to get active frequency feedback for the last sampling
interval. This lets ondemand make the right frequency decisions when
hardware coordination of frequency is going on. Without APERF/MPERF,
ondemand can make wrong decisions at times due to underlying hardware
coordination or TM2.

Example:
* CPU 0 and CPU 1 are hardware coordinated.
* CPU 1 is running at its highest frequency.
* CPU 0 was running at its highest frequency. Now ondemand reduces it to
  some intermediate frequency based on utilization.
* Due to hardware coordination with CPU 1, CPU 0 continues to run at the
  highest frequency (as long as the other CPU is at its highest).
* When ondemand samples CPU 0 the next time, without actual frequency
  feedback from APERF/MPERF it will assume the previous frequency change
  succeeded and can pick a wrong target frequency, because it thinks the
  utilization it measured over this sampling interval was at the
  intermediate frequency rather than at the actual highest frequency.

For more information about the IA32_APERF and IA32_MPERF MSRs, refer to
the IA-32 Intel® Architecture Software Developer's Manual at
http://developer.intel.com

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c  107
-rw-r--r--  drivers/cpufreq/cpufreq.c                     20
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c             9
-rw-r--r--  include/asm-i386/msr.h                         3
-rw-r--r--  include/asm-x86_64/msr.h                       3
-rw-r--r--  include/linux/cpufreq.h                        3
6 files changed, 143 insertions(+), 2 deletions(-)
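
The feedback mechanism described in the changelog is easy to observe from
userspace. Here is a minimal sketch, not part of this patch, that samples
the same two MSRs through the msr driver and computes the C0 ratio over a
one-second interval. It assumes the msr kernel module is loaded (providing
/dev/cpu/0/msr) and root privileges; unlike the kernel code in the diff
below, it takes counter deltas instead of zeroing the MSRs.

/*
 * Userspace sketch of the APERF/MPERF ratio measurement. The MSR
 * addresses 0xE7/0xE8 are the ones added by this patch.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_MPERF 0xE7
#define MSR_IA32_APERF 0xE8

static uint64_t rdmsr_fd(int fd, uint32_t msr)
{
	uint64_t val = 0;

	/* the msr driver uses the MSR address as the file offset */
	if (pread(fd, &val, sizeof(val), msr) != sizeof(val))
		perror("pread");
	return val;
}

int main(void)
{
	uint64_t a0, m0, a1, m1;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	a0 = rdmsr_fd(fd, MSR_IA32_APERF);
	m0 = rdmsr_fd(fd, MSR_IA32_MPERF);
	sleep(1);			/* sampling interval */
	a1 = rdmsr_fd(fd, MSR_IA32_APERF);
	m1 = rdmsr_fd(fd, MSR_IA32_MPERF);
	close(fd);

	if (m1 - m0)	/* MPERF advances whenever the CPU is in C0 */
		printf("average C0 frequency: %llu%% of maximum\n",
		       (unsigned long long)((a1 - a0) * 100 / (m1 - m0)));
	return 0;
}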
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8b0c7db85a47..f8a8e46acb78 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -58,10 +58,12 @@ enum {
 };
 
 #define INTEL_MSR_RANGE (0xffff)
+#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
 
 struct acpi_cpufreq_data {
 	struct acpi_processor_performance *acpi_data;
 	struct cpufreq_frequency_table *freq_table;
+	unsigned int max_freq;
 	unsigned int resume;
 	unsigned int cpu_feature;
 };
@@ -258,6 +260,100 @@ static u32 get_cur_val(cpumask_t mask)
 	return cmd.val;
 }
 
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+static unsigned int get_measured_perf(unsigned int cpu)
+{
+	union {
+		struct {
+			u32 lo;
+			u32 hi;
+		} split;
+		u64 whole;
+	} aperf_cur, mperf_cur;
+
+	cpumask_t saved_mask;
+	unsigned int perf_percent;
+	unsigned int retval;
+
+	saved_mask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	if (get_cpu() != cpu) {
+		/* We were not able to run on requested processor */
+		put_cpu();
+		return 0;
+	}
+
+	rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
+	rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
+
+	wrmsr(MSR_IA32_APERF, 0,0);
+	wrmsr(MSR_IA32_MPERF, 0,0);
+
+#ifdef __i386__
+	/*
+	 * We dont want to do 64 bit divide with 32 bit kernel
+	 * Get an approximate value. Return failure in case we cannot get
+	 * an approximate value.
+	 */
+	if (unlikely(aperf_cur.split.hi || mperf_cur.split.hi)) {
+		int shift_count;
+		u32 h;
+
+		h = max_t(u32, aperf_cur.split.hi, mperf_cur.split.hi);
+		shift_count = fls(h);
+
+		aperf_cur.whole >>= shift_count;
+		mperf_cur.whole >>= shift_count;
+	}
+
+	if (((unsigned long)(-1) / 100) < aperf_cur.split.lo) {
+		int shift_count = 7;
+		aperf_cur.split.lo >>= shift_count;
+		mperf_cur.split.lo >>= shift_count;
+	}
+
+	if (aperf_cur.split.lo && mperf_cur.split.lo) {
+		perf_percent = (aperf_cur.split.lo * 100) / mperf_cur.split.lo;
+	} else {
+		perf_percent = 0;
+	}
+
+#else
+	if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
+		int shift_count = 7;
+		aperf_cur.whole >>= shift_count;
+		mperf_cur.whole >>= shift_count;
+	}
+
+	if (aperf_cur.whole && mperf_cur.whole) {
+		perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole;
+	} else {
+		perf_percent = 0;
+	}
+
+#endif
+
+	retval = drv_data[cpu]->max_freq * perf_percent / 100;
+
+	put_cpu();
+	set_cpus_allowed(current, saved_mask);
+
+	dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
+	return retval;
+}
+
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
 	struct acpi_cpufreq_data *data = drv_data[cpu];
@@ -497,7 +593,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	unsigned int valid_states = 0;
 	unsigned int cpu = policy->cpu;
 	struct acpi_cpufreq_data *data;
-	unsigned int l, h;
 	unsigned int result = 0;
 	struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
 	struct acpi_processor_performance *perf;
@@ -591,6 +686,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	}
 	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
 
+	data->max_freq = perf->states[0].core_frequency * 1000;
 	/* table init */
 	for (i = 0; i < perf->state_count; i++) {
 		if (i > 0 && perf->states[i].core_frequency ==
@@ -625,6 +721,15 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	/* notify BIOS that we exist */
 	acpi_processor_notify_smm(THIS_MODULE);
 
+	/* Check for APERF/MPERF support in hardware */
+	if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
+		unsigned int ecx;
+		ecx = cpuid_ecx(6);
+		if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) {
+			acpi_cpufreq_driver.getavg = get_measured_perf;
+		}
+	}
+
 	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
 		dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
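
The #ifdef __i386__ branch of get_measured_perf() above sidesteps a 64-bit
division on 32-bit kernels: because only the APERF/MPERF ratio matters,
both counters can be shifted right by the same amount without changing the
result beyond rounding. The following standalone userspace sketch
reproduces that arithmetic; fls32() is a portable stand-in for the
kernel's fls().

/*
 * Standalone rendering of the 32-bit approximation in get_measured_perf().
 * fls32() returns the 1-based index of the highest set bit, 0 for x == 0.
 */
#include <stdint.h>
#include <stdio.h>

static int fls32(uint32_t x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static unsigned int perf_percent32(uint64_t aperf, uint64_t mperf)
{
	uint32_t a_hi = aperf >> 32, m_hi = mperf >> 32;
	uint32_t a_lo, m_lo;

	/*
	 * Shift both counters right until they fit in 32 bits; shifting
	 * both by the same amount preserves their ratio.
	 */
	if (a_hi || m_hi) {
		int shift = fls32(a_hi > m_hi ? a_hi : m_hi);

		aperf >>= shift;
		mperf >>= shift;
	}
	a_lo = (uint32_t)aperf;
	m_lo = (uint32_t)mperf;

	/* keep a_lo * 100 from overflowing 32 bits */
	if (a_lo > (uint32_t)-1 / 100) {
		a_lo >>= 7;
		m_lo >>= 7;
	}
	return m_lo ? (a_lo * 100) / m_lo : 0;
}

int main(void)
{
	/* CPU spent its C0 time at half speed: prints 49, i.e. roughly
	 * 50%; the shifts cost a little precision */
	printf("%u%%\n", perf_percent32(0x123456789ULL, 0x2468ACF12ULL));
	return 0;
}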
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 86e69b7f9122..56c433e64d58 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1274,6 +1274,26 @@ int cpufreq_driver_target(struct cpufreq_policy *policy,
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_target);
 
+int cpufreq_driver_getavg(struct cpufreq_policy *policy)
+{
+	int ret = 0;
+
+	policy = cpufreq_cpu_get(policy->cpu);
+	if (!policy)
+		return -EINVAL;
+
+	mutex_lock(&policy->lock);
+
+	if (cpu_online(policy->cpu) && cpufreq_driver->getavg)
+		ret = cpufreq_driver->getavg(policy->cpu);
+
+	mutex_unlock(&policy->lock);
+
+	cpufreq_cpu_put(policy);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpufreq_driver_getavg);
+
 /*
  * Locking: Must be called with the lock_cpu_hotplug() lock held
  * when "event" is CPUFREQ_GOV_LIMITS
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bf8aa45d4f01..291cfe9400a1 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -393,8 +393,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 	 * policy. To be safe, we focus 10 points under the threshold.
 	 */
 	if (load < (dbs_tuners_ins.up_threshold - 10)) {
-		unsigned int freq_next = (policy->cur * load) /
+		unsigned int freq_next, freq_cur;
+
+		freq_cur = cpufreq_driver_getavg(policy);
+		if (!freq_cur)
+			freq_cur = policy->cur;
+
+		freq_next = (freq_cur * load) /
 				(dbs_tuners_ins.up_threshold - 10);
+
 		if (!dbs_tuners_ins.powersave_bias) {
 			__cpufreq_driver_target(policy, freq_next,
 						CPUFREQ_RELATION_L);
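
The effect of the new feedback on ondemand's calculation is easiest to see
with numbers. The sketch below replays the scenario from the changelog
with hypothetical frequencies, assuming the governor's default
up_threshold of 80 (so the divisor is 70).

/*
 * Worked example of the freq_next computation above: ondemand last
 * requested 1000 MHz, hardware coordination kept the core at 2000 MHz,
 * and the measured load this interval is 40%.
 */
#include <stdio.h>

int main(void)
{
	unsigned int up_threshold = 80;	/* assumed default tuner value */
	unsigned int load = 40;		/* percent busy this interval */

	unsigned int without_feedback = (1000 * load) / (up_threshold - 10);
	unsigned int with_feedback = (2000 * load) / (up_threshold - 10);

	/*
	 * Prints "571 MHz vs 1142 MHz": without APERF/MPERF feedback the
	 * governor undershoots by 2x, because the 40% load was really
	 * measured at 2000 MHz, not at the 1000 MHz it believes it set.
	 */
	printf("%u MHz vs %u MHz\n", without_feedback, with_feedback);
	return 0;
}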
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 62b76cd96957..0aa15fc8d918 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -125,6 +125,9 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
 #define MSR_IA32_PERF_STATUS	0x198
 #define MSR_IA32_PERF_CTL	0x199
 
+#define MSR_IA32_MPERF	0xE7
+#define MSR_IA32_APERF	0xE8
+
 #define MSR_IA32_THERM_CONTROL	0x19a
 #define MSR_IA32_THERM_INTERRUPT	0x19b
 #define MSR_IA32_THERM_STATUS	0x19c
diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index 37e194169fac..e61582288737 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -307,6 +307,9 @@ static inline unsigned int cpuid_edx(unsigned int op)
 #define MSR_IA32_PERF_STATUS	0x198
 #define MSR_IA32_PERF_CTL	0x199
 
+#define MSR_IA32_MPERF	0xE7
+#define MSR_IA32_APERF	0xE8
+
 #define MSR_IA32_THERM_CONTROL	0x19a
 #define MSR_IA32_THERM_INTERRUPT	0x19b
 #define MSR_IA32_THERM_STATUS	0x19c
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 4ea39fee99c7..7f008f6bfdc3 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -172,6 +172,8 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
 				   unsigned int relation);
 
 
+extern int cpufreq_driver_getavg(struct cpufreq_policy *policy);
+
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
@@ -204,6 +206,7 @@ struct cpufreq_driver {
 	unsigned int	(*get)	(unsigned int cpu);
 
 	/* optional */
+	unsigned int (*getavg)	(unsigned int cpu);
 	int	(*exit)		(struct cpufreq_policy *policy);
 	int	(*suspend)	(struct cpufreq_policy *policy, pm_message_t pmsg);
 	int	(*resume)	(struct cpufreq_policy *policy);
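
Since getavg is an optional driver hook, any cpufreq driver that can
measure delivered frequency may opt in the same way acpi-cpufreq does
above. A minimal, hypothetical registration sketch (the driver name and
my_measured_avg() are invented for illustration):

#include <linux/cpufreq.h>

/* hypothetical helper: measured average frequency in kHz, 0 on error */
static unsigned int my_measured_avg(unsigned int cpu)
{
	return 0;
}

static struct cpufreq_driver my_driver = {
	.name	= "my-cpufreq",
	.getavg	= my_measured_avg,	/* optional feedback for ondemand */
	/* .init, .verify, .target etc. as for any cpufreq driver */
};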