aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorJacob Pan <jacob.jun.pan@linux.intel.com>2013-01-21 07:37:57 -0500
committerZhang Rui <rui.zhang@intel.com>2013-02-06 00:45:00 -0500
commitd6d71ee4a14ae602db343ec48c491851d7ec5267 (patch)
tree91522b3d9f9d5d63cfe47af65dab2457d601129c /drivers
parent29c6fb7be156ae3c0e202c3903087ab6e57d3ad3 (diff)
PM: Introduce Intel PowerClamp Driver
Intel PowerClamp driver performs synchronized idle injection across all online CPUs. The goal is to maintain a given package level C-state ratio. Compared to other throttling methods that already exist in the kernel, such as ACPI PAD (taking CPUs offline) and clock modulation, this is often more efficient in terms of performance per watt. Please refer to Documentation/thermal/intel_powerclamp.txt for more details. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/thermal/Kconfig10
-rw-r--r--drivers/thermal/Makefile2
-rw-r--r--drivers/thermal/intel_powerclamp.c794
3 files changed, 806 insertions, 0 deletions
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index c31b9e4451a3..faf38c522fa8 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -131,4 +131,14 @@ config DB8500_CPUFREQ_COOLING
131 bound cpufreq cooling device turns active to set CPU frequency low to 131 bound cpufreq cooling device turns active to set CPU frequency low to
132 cool down the CPU. 132 cool down the CPU.
133 133
134config INTEL_POWERCLAMP
135 tristate "Intel PowerClamp idle injection driver"
136 depends on THERMAL
137 depends on X86
138 depends on CPU_SUP_INTEL
139 help
140 Enable this to build the Intel PowerClamp idle injection driver. This
141 enforces idle time, which results in more package C-state residency. The
142 user interface is exposed via the generic thermal framework.
143
134endif 144endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index d8da683245fc..574f5f505b9f 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -18,3 +18,5 @@ obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o
18obj-$(CONFIG_EXYNOS_THERMAL) += exynos_thermal.o 18obj-$(CONFIG_EXYNOS_THERMAL) += exynos_thermal.o
19obj-$(CONFIG_DB8500_THERMAL) += db8500_thermal.o 19obj-$(CONFIG_DB8500_THERMAL) += db8500_thermal.o
20obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o 20obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o
21obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o
22
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
new file mode 100644
index 000000000000..a85ff38cb4e8
--- /dev/null
+++ b/drivers/thermal/intel_powerclamp.c
@@ -0,0 +1,794 @@
1/*
2 * intel_powerclamp.c - package c-state idle injection
3 *
4 * Copyright (c) 2012, Intel Corporation.
5 *
6 * Authors:
7 * Arjan van de Ven <arjan@linux.intel.com>
8 * Jacob Pan <jacob.jun.pan@linux.intel.com>
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms and conditions of the GNU General Public License,
12 * version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 * more details.
18 *
19 * You should have received a copy of the GNU General Public License along with
20 * this program; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
22 *
23 *
24 * TODO:
25 * 1. better handle wakeup from external interrupts, currently a fixed
26 * compensation is added to clamping duration when excessive amount
27 * of wakeups are observed during idle time. the reason is that in
28 * case of external interrupts without need for ack, clamping down
29 * cpu in non-irq context does not reduce irq. for majority of the
30 * cases, clamping down cpu does help reduce irq as well, we should
31 * be able to differentiate the two cases and give a quantitative
32 * solution for the irqs that we can control. perhaps based on
33 * get_cpu_iowait_time_us()
34 *
35 * 2. synchronization with other hw blocks
36 *
37 *
38 */
39
40#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
41
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>
60
61#define MAX_TARGET_RATIO (50U)
62/* For each undisturbed clamping period (no extra wake ups during idle time),
63 * we increment the confidence counter for the given target ratio.
64 * CONFIDENCE_OK defines the level where runtime calibration results are
65 * valid.
66 */
67#define CONFIDENCE_OK (3)
68/* Default idle injection duration, driver adjust sleep time to meet target
69 * idle ratio. Similar to frequency modulation.
70 */
71#define DEFAULT_DURATION_JIFFIES (6)
72
73static unsigned int target_mwait;
74static struct dentry *debug_dir;
75
76/* user selected target */
77static unsigned int set_target_ratio;
78static unsigned int current_ratio;
79static bool should_skip;
80static bool reduce_irq;
81static atomic_t idle_wakeup_counter;
82static unsigned int control_cpu; /* The cpu assigned to collect stat and update
83 * control parameters. default to BSP but BSP
84 * can be offlined.
85 */
86static bool clamping;
87
88
89static struct task_struct * __percpu *powerclamp_thread;
90static struct thermal_cooling_device *cooling_dev;
91static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu
92 * clamping thread
93 */
94
95static unsigned int duration;
96static unsigned int pkg_cstate_ratio_cur;
97static unsigned int window_size;
98
99static int duration_set(const char *arg, const struct kernel_param *kp)
100{
101 int ret = 0;
102 unsigned long new_duration;
103
104 ret = kstrtoul(arg, 10, &new_duration);
105 if (ret)
106 goto exit;
107 if (new_duration > 25 || new_duration < 6) {
108 pr_err("Out of recommended range %lu, between 6-25ms\n",
109 new_duration);
110 ret = -EINVAL;
111 }
112
113 duration = clamp(new_duration, 6ul, 25ul);
114 smp_mb();
115
116exit:
117
118 return ret;
119}
120
121static struct kernel_param_ops duration_ops = {
122 .set = duration_set,
123 .get = param_get_int,
124};
125
126
127module_param_cb(duration, &duration_ops, &duration, 0644);
128MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
129
130struct powerclamp_calibration_data {
131 unsigned long confidence; /* used for calibration, basically a counter
132 * gets incremented each time a clamping
133 * period is completed without extra wakeups
134 * once that counter is reached given level,
135 * compensation is deemed usable.
136 */
137 unsigned long steady_comp; /* steady state compensation used when
138 * no extra wakeups occurred.
139 */
140 unsigned long dynamic_comp; /* compensate excessive wakeup from idle
141 * mostly from external interrupts.
142 */
143};
144
145static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
146
147static int window_size_set(const char *arg, const struct kernel_param *kp)
148{
149 int ret = 0;
150 unsigned long new_window_size;
151
152 ret = kstrtoul(arg, 10, &new_window_size);
153 if (ret)
154 goto exit_win;
155 if (new_window_size > 10 || new_window_size < 2) {
156 pr_err("Out of recommended window size %lu, between 2-10\n",
157 new_window_size);
158 ret = -EINVAL;
159 }
160
161 window_size = clamp(new_window_size, 2ul, 10ul);
162 smp_mb();
163
164exit_win:
165
166 return ret;
167}
168
169static struct kernel_param_ops window_size_ops = {
170 .set = window_size_set,
171 .get = param_get_int,
172};
173
174module_param_cb(window_size, &window_size_ops, &window_size, 0644);
175MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
176 "\tpowerclamp controls idle ratio within this window. larger\n"
177 "\twindow size results in slower response time but more smooth\n"
178 "\tclamping results. default to 2.");
179
180static void find_target_mwait(void)
181{
182 unsigned int eax, ebx, ecx, edx;
183 unsigned int highest_cstate = 0;
184 unsigned int highest_subcstate = 0;
185 int i;
186
187 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
188 return;
189
190 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
191
192 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
193 !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
194 return;
195
196 edx >>= MWAIT_SUBSTATE_SIZE;
197 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
198 if (edx & MWAIT_SUBSTATE_MASK) {
199 highest_cstate = i;
200 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
201 }
202 }
203 target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
204 (highest_subcstate - 1);
205
206}
207
208static u64 pkg_state_counter(void)
209{
210 u64 val;
211 u64 count = 0;
212
213 static bool skip_c2;
214 static bool skip_c3;
215 static bool skip_c6;
216 static bool skip_c7;
217
218 if (!skip_c2) {
219 if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
220 count += val;
221 else
222 skip_c2 = true;
223 }
224
225 if (!skip_c3) {
226 if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
227 count += val;
228 else
229 skip_c3 = true;
230 }
231
232 if (!skip_c6) {
233 if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
234 count += val;
235 else
236 skip_c6 = true;
237 }
238
239 if (!skip_c7) {
240 if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
241 count += val;
242 else
243 skip_c7 = true;
244 }
245
246 return count;
247}
248
/*
 * noop_timer - timer callback that intentionally does nothing
 * @foo: timer cookie, ignored
 *
 * The timer interrupt itself is what wakes the cpu up; no work is needed.
 */
static void noop_timer(unsigned long foo)
{
	(void)foo;
}
253
254static unsigned int get_compensation(int ratio)
255{
256 unsigned int comp = 0;
257
258 /* we only use compensation if all adjacent ones are good */
259 if (ratio == 1 &&
260 cal_data[ratio].confidence >= CONFIDENCE_OK &&
261 cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
262 cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
263 comp = (cal_data[ratio].steady_comp +
264 cal_data[ratio + 1].steady_comp +
265 cal_data[ratio + 2].steady_comp) / 3;
266 } else if (ratio == MAX_TARGET_RATIO - 1 &&
267 cal_data[ratio].confidence >= CONFIDENCE_OK &&
268 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
269 cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
270 comp = (cal_data[ratio].steady_comp +
271 cal_data[ratio - 1].steady_comp +
272 cal_data[ratio - 2].steady_comp) / 3;
273 } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
274 cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
275 cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
276 comp = (cal_data[ratio].steady_comp +
277 cal_data[ratio - 1].steady_comp +
278 cal_data[ratio + 1].steady_comp) / 3;
279 }
280
281 /* REVISIT: simple penalty of double idle injection */
282 if (reduce_irq)
283 comp = ratio;
284 /* do not exceed limit */
285 if (comp + ratio >= MAX_TARGET_RATIO)
286 comp = MAX_TARGET_RATIO - ratio - 1;
287
288 return comp;
289}
290
291static void adjust_compensation(int target_ratio, unsigned int win)
292{
293 int delta;
294 struct powerclamp_calibration_data *d = &cal_data[target_ratio];
295
296 /*
297 * adjust compensations if confidence level has not been reached or
298 * there are too many wakeups during the last idle injection period, we
299 * cannot trust the data for compensation.
300 */
301 if (d->confidence >= CONFIDENCE_OK ||
302 atomic_read(&idle_wakeup_counter) >
303 win * num_online_cpus())
304 return;
305
306 delta = set_target_ratio - current_ratio;
307 /* filter out bad data */
308 if (delta >= 0 && delta <= (1+target_ratio/10)) {
309 if (d->steady_comp)
310 d->steady_comp =
311 roundup(delta+d->steady_comp, 2)/2;
312 else
313 d->steady_comp = delta;
314 d->confidence++;
315 }
316}
317
318static bool powerclamp_adjust_controls(unsigned int target_ratio,
319 unsigned int guard, unsigned int win)
320{
321 static u64 msr_last, tsc_last;
322 u64 msr_now, tsc_now;
323 u64 val64;
324
325 /* check result for the last window */
326 msr_now = pkg_state_counter();
327 rdtscll(tsc_now);
328
329 /* calculate pkg cstate vs tsc ratio */
330 if (!msr_last || !tsc_last)
331 current_ratio = 1;
332 else if (tsc_now-tsc_last) {
333 val64 = 100*(msr_now-msr_last);
334 do_div(val64, (tsc_now-tsc_last));
335 current_ratio = val64;
336 }
337
338 /* update record */
339 msr_last = msr_now;
340 tsc_last = tsc_now;
341
342 adjust_compensation(target_ratio, win);
343 /*
344 * too many external interrupts, set flag such
345 * that we can take measure later.
346 */
347 reduce_irq = atomic_read(&idle_wakeup_counter) >=
348 2 * win * num_online_cpus();
349
350 atomic_set(&idle_wakeup_counter, 0);
351 /* if we are above target+guard, skip */
352 return set_target_ratio + guard <= current_ratio;
353}
354
355static int clamp_thread(void *arg)
356{
357 int cpunr = (unsigned long)arg;
358 DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
359 static const struct sched_param param = {
360 .sched_priority = MAX_USER_RT_PRIO/2,
361 };
362 unsigned int count = 0;
363 unsigned int target_ratio;
364
365 set_bit(cpunr, cpu_clamping_mask);
366 set_freezable();
367 init_timer_on_stack(&wakeup_timer);
368 sched_setscheduler(current, SCHED_FIFO, &param);
369
370 while (true == clamping && !kthread_should_stop() &&
371 cpu_online(cpunr)) {
372 int sleeptime;
373 unsigned long target_jiffies;
374 unsigned int guard;
375 unsigned int compensation = 0;
376 int interval; /* jiffies to sleep for each attempt */
377 unsigned int duration_jiffies = msecs_to_jiffies(duration);
378 unsigned int window_size_now;
379
380 try_to_freeze();
381 /*
382 * make sure user selected ratio does not take effect until
383 * the next round. adjust target_ratio if user has changed
384 * target such that we can converge quickly.
385 */
386 target_ratio = set_target_ratio;
387 guard = 1 + target_ratio/20;
388 window_size_now = window_size;
389 count++;
390
391 /*
392 * systems may have different ability to enter package level
393 * c-states, thus we need to compensate the injected idle ratio
394 * to achieve the actual target reported by the HW.
395 */
396 compensation = get_compensation(target_ratio);
397 interval = duration_jiffies*100/(target_ratio+compensation);
398
399 /* align idle time */
400 target_jiffies = roundup(jiffies, interval);
401 sleeptime = target_jiffies - jiffies;
402 if (sleeptime <= 0)
403 sleeptime = 1;
404 schedule_timeout_interruptible(sleeptime);
405 /*
406 * only elected controlling cpu can collect stats and update
407 * control parameters.
408 */
409 if (cpunr == control_cpu && !(count%window_size_now)) {
410 should_skip =
411 powerclamp_adjust_controls(target_ratio,
412 guard, window_size_now);
413 smp_mb();
414 }
415
416 if (should_skip)
417 continue;
418
419 target_jiffies = jiffies + duration_jiffies;
420 mod_timer(&wakeup_timer, target_jiffies);
421 if (unlikely(local_softirq_pending()))
422 continue;
423 /*
424 * stop tick sched during idle time, interrupts are still
425 * allowed. thus jiffies are updated properly.
426 */
427 preempt_disable();
428 tick_nohz_idle_enter();
429 /* mwait until target jiffies is reached */
430 while (time_before(jiffies, target_jiffies)) {
431 unsigned long ecx = 1;
432 unsigned long eax = target_mwait;
433
434 /*
435 * REVISIT: may call enter_idle() to notify drivers who
436 * can save power during cpu idle. same for exit_idle()
437 */
438 local_touch_nmi();
439 stop_critical_timings();
440 __monitor((void *)&current_thread_info()->flags, 0, 0);
441 cpu_relax(); /* allow HT sibling to run */
442 __mwait(eax, ecx);
443 start_critical_timings();
444 atomic_inc(&idle_wakeup_counter);
445 }
446 tick_nohz_idle_exit();
447 preempt_enable_no_resched();
448 }
449 del_timer_sync(&wakeup_timer);
450 clear_bit(cpunr, cpu_clamping_mask);
451
452 return 0;
453}
454
455/*
456 * 1 HZ polling while clamping is active, useful for userspace
457 * to monitor actual idle ratio.
458 */
459static void poll_pkg_cstate(struct work_struct *dummy);
460static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
461static void poll_pkg_cstate(struct work_struct *dummy)
462{
463 static u64 msr_last;
464 static u64 tsc_last;
465 static unsigned long jiffies_last;
466
467 u64 msr_now;
468 unsigned long jiffies_now;
469 u64 tsc_now;
470 u64 val64;
471
472 msr_now = pkg_state_counter();
473 rdtscll(tsc_now);
474 jiffies_now = jiffies;
475
476 /* calculate pkg cstate vs tsc ratio */
477 if (!msr_last || !tsc_last)
478 pkg_cstate_ratio_cur = 1;
479 else {
480 if (tsc_now - tsc_last) {
481 val64 = 100 * (msr_now - msr_last);
482 do_div(val64, (tsc_now - tsc_last));
483 pkg_cstate_ratio_cur = val64;
484 }
485 }
486
487 /* update record */
488 msr_last = msr_now;
489 jiffies_last = jiffies_now;
490 tsc_last = tsc_now;
491
492 if (true == clamping)
493 schedule_delayed_work(&poll_pkg_cstate_work, HZ);
494}
495
496static int start_power_clamp(void)
497{
498 unsigned long cpu;
499 struct task_struct *thread;
500
501 /* check if pkg cstate counter is completely 0, abort in this case */
502 if (!pkg_state_counter()) {
503 pr_err("pkg cstate counter not functional, abort\n");
504 return -EINVAL;
505 }
506
507 set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO);
508 /* prevent cpu hotplug */
509 get_online_cpus();
510
511 /* prefer BSP */
512 control_cpu = 0;
513 if (!cpu_online(control_cpu))
514 control_cpu = smp_processor_id();
515
516 clamping = true;
517 schedule_delayed_work(&poll_pkg_cstate_work, 0);
518
519 /* start one thread per online cpu */
520 for_each_online_cpu(cpu) {
521 struct task_struct **p =
522 per_cpu_ptr(powerclamp_thread, cpu);
523
524 thread = kthread_create_on_node(clamp_thread,
525 (void *) cpu,
526 cpu_to_node(cpu),
527 "kidle_inject/%ld", cpu);
528 /* bind to cpu here */
529 if (likely(!IS_ERR(thread))) {
530 kthread_bind(thread, cpu);
531 wake_up_process(thread);
532 *p = thread;
533 }
534
535 }
536 put_online_cpus();
537
538 return 0;
539}
540
541static void end_power_clamp(void)
542{
543 int i;
544 struct task_struct *thread;
545
546 clamping = false;
547 /*
548 * make clamping visible to other cpus and give per cpu clamping threads
549 * sometime to exit, or gets killed later.
550 */
551 smp_mb();
552 msleep(20);
553 if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
554 for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
555 pr_debug("clamping thread for cpu %d alive, kill\n", i);
556 thread = *per_cpu_ptr(powerclamp_thread, i);
557 kthread_stop(thread);
558 }
559 }
560}
561
562static int powerclamp_cpu_callback(struct notifier_block *nfb,
563 unsigned long action, void *hcpu)
564{
565 unsigned long cpu = (unsigned long)hcpu;
566 struct task_struct *thread;
567 struct task_struct **percpu_thread =
568 per_cpu_ptr(powerclamp_thread, cpu);
569
570 if (false == clamping)
571 goto exit_ok;
572
573 switch (action) {
574 case CPU_ONLINE:
575 thread = kthread_create_on_node(clamp_thread,
576 (void *) cpu,
577 cpu_to_node(cpu),
578 "kidle_inject/%lu", cpu);
579 if (likely(!IS_ERR(thread))) {
580 kthread_bind(thread, cpu);
581 wake_up_process(thread);
582 *percpu_thread = thread;
583 }
584 /* prefer BSP as controlling CPU */
585 if (cpu == 0) {
586 control_cpu = 0;
587 smp_mb();
588 }
589 break;
590 case CPU_DEAD:
591 if (test_bit(cpu, cpu_clamping_mask)) {
592 pr_err("cpu %lu dead but powerclamping thread is not\n",
593 cpu);
594 kthread_stop(*percpu_thread);
595 }
596 if (cpu == control_cpu) {
597 control_cpu = smp_processor_id();
598 smp_mb();
599 }
600 }
601
602exit_ok:
603 return NOTIFY_OK;
604}
605
606static struct notifier_block powerclamp_cpu_notifier = {
607 .notifier_call = powerclamp_cpu_callback,
608};
609
610static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
611 unsigned long *state)
612{
613 *state = MAX_TARGET_RATIO;
614
615 return 0;
616}
617
618static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
619 unsigned long *state)
620{
621 if (true == clamping)
622 *state = pkg_cstate_ratio_cur;
623 else
624 /* to save power, do not poll idle ratio while not clamping */
625 *state = -1; /* indicates invalid state */
626
627 return 0;
628}
629
630static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
631 unsigned long new_target_ratio)
632{
633 int ret = 0;
634
635 new_target_ratio = clamp(new_target_ratio, 0UL,
636 (unsigned long) (MAX_TARGET_RATIO-1));
637 if (set_target_ratio == 0 && new_target_ratio > 0) {
638 pr_info("Start idle injection to reduce power\n");
639 set_target_ratio = new_target_ratio;
640 ret = start_power_clamp();
641 goto exit_set;
642 } else if (set_target_ratio > 0 && new_target_ratio == 0) {
643 pr_info("Stop forced idle injection\n");
644 set_target_ratio = 0;
645 end_power_clamp();
646 } else /* adjust currently running */ {
647 set_target_ratio = new_target_ratio;
648 /* make new set_target_ratio visible to other cpus */
649 smp_mb();
650 }
651
652exit_set:
653 return ret;
654}
655
656/* bind to generic thermal layer as cooling device*/
657static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
658 .get_max_state = powerclamp_get_max_state,
659 .get_cur_state = powerclamp_get_cur_state,
660 .set_cur_state = powerclamp_set_cur_state,
661};
662
663/* runs on Nehalem and later */
664static const struct x86_cpu_id intel_powerclamp_ids[] = {
665 { X86_VENDOR_INTEL, 6, 0x1a},
666 { X86_VENDOR_INTEL, 6, 0x1c},
667 { X86_VENDOR_INTEL, 6, 0x1e},
668 { X86_VENDOR_INTEL, 6, 0x1f},
669 { X86_VENDOR_INTEL, 6, 0x25},
670 { X86_VENDOR_INTEL, 6, 0x26},
671 { X86_VENDOR_INTEL, 6, 0x2a},
672 { X86_VENDOR_INTEL, 6, 0x2c},
673 { X86_VENDOR_INTEL, 6, 0x2d},
674 { X86_VENDOR_INTEL, 6, 0x2e},
675 { X86_VENDOR_INTEL, 6, 0x2f},
676 { X86_VENDOR_INTEL, 6, 0x3a},
677 {}
678};
679MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
680
681static int powerclamp_probe(void)
682{
683 if (!x86_match_cpu(intel_powerclamp_ids)) {
684 pr_err("Intel powerclamp does not run on family %d model %d\n",
685 boot_cpu_data.x86, boot_cpu_data.x86_model);
686 return -ENODEV;
687 }
688 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
689 !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
690 !boot_cpu_has(X86_FEATURE_MWAIT) ||
691 !boot_cpu_has(X86_FEATURE_ARAT))
692 return -ENODEV;
693
694 /* find the deepest mwait value */
695 find_target_mwait();
696
697 return 0;
698}
699
700static int powerclamp_debug_show(struct seq_file *m, void *unused)
701{
702 int i = 0;
703
704 seq_printf(m, "controlling cpu: %d\n", control_cpu);
705 seq_printf(m, "pct confidence steady dynamic (compensation)\n");
706 for (i = 0; i < MAX_TARGET_RATIO; i++) {
707 seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
708 i,
709 cal_data[i].confidence,
710 cal_data[i].steady_comp,
711 cal_data[i].dynamic_comp);
712 }
713
714 return 0;
715}
716
717static int powerclamp_debug_open(struct inode *inode,
718 struct file *file)
719{
720 return single_open(file, powerclamp_debug_show, inode->i_private);
721}
722
723static const struct file_operations powerclamp_debug_fops = {
724 .open = powerclamp_debug_open,
725 .read = seq_read,
726 .llseek = seq_lseek,
727 .release = single_release,
728 .owner = THIS_MODULE,
729};
730
731static inline void powerclamp_create_debug_files(void)
732{
733 debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
734 if (!debug_dir)
735 return;
736
737 if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
738 cal_data, &powerclamp_debug_fops))
739 goto file_error;
740
741 return;
742
743file_error:
744 debugfs_remove_recursive(debug_dir);
745}
746
747static int powerclamp_init(void)
748{
749 int retval;
750 int bitmap_size;
751
752 bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
753 cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
754 if (!cpu_clamping_mask)
755 return -ENOMEM;
756
757 /* probe cpu features and ids here */
758 retval = powerclamp_probe();
759 if (retval)
760 return retval;
761 /* set default limit, maybe adjusted during runtime based on feedback */
762 window_size = 2;
763 register_hotcpu_notifier(&powerclamp_cpu_notifier);
764 powerclamp_thread = alloc_percpu(struct task_struct *);
765 cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
766 &powerclamp_cooling_ops);
767 if (IS_ERR(cooling_dev))
768 return -ENODEV;
769
770 if (!duration)
771 duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);
772 powerclamp_create_debug_files();
773
774 return 0;
775}
776module_init(powerclamp_init);
777
778static void powerclamp_exit(void)
779{
780 unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
781 end_power_clamp();
782 free_percpu(powerclamp_thread);
783 thermal_cooling_device_unregister(cooling_dev);
784 kfree(cpu_clamping_mask);
785
786 cancel_delayed_work_sync(&poll_pkg_cstate_work);
787 debugfs_remove_recursive(debug_dir);
788}
789module_exit(powerclamp_exit);
790
791MODULE_LICENSE("GPL");
792MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
793MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
794MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");