Diffstat (limited to 'drivers/thermal/intel/intel_powerclamp.c')
 drivers/thermal/intel/intel_powerclamp.c | 815 ++++++++++++++++++++++++++++++
 1 file changed, 815 insertions(+), 0 deletions(-)
diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
new file mode 100644
index 000000000000..cde891c54cde
--- /dev/null
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -0,0 +1,815 @@
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 * TODO:
 * 1. Better handle wakeups from external interrupts. Currently a fixed
 *    compensation is added to the clamping duration when an excessive
 *    number of wakeups is observed during idle time. The reason is that,
 *    for external interrupts that need no ack, clamping down the CPU in
 *    non-irq context does not reduce the irq rate. In the majority of
 *    cases clamping down the CPU does help reduce irqs as well; we should
 *    be able to differentiate the two cases and give a quantitative
 *    solution for the irqs that we can control, perhaps based on
 *    get_cpu_iowait_time_us().
 *
 * 2. Synchronization with other hw blocks.
 */
39 | |||
40 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
41 | |||
42 | #include <linux/module.h> | ||
43 | #include <linux/kernel.h> | ||
44 | #include <linux/delay.h> | ||
45 | #include <linux/kthread.h> | ||
46 | #include <linux/cpu.h> | ||
47 | #include <linux/thermal.h> | ||
48 | #include <linux/slab.h> | ||
49 | #include <linux/tick.h> | ||
50 | #include <linux/debugfs.h> | ||
51 | #include <linux/seq_file.h> | ||
52 | #include <linux/sched/rt.h> | ||
53 | #include <uapi/linux/sched/types.h> | ||
54 | |||
55 | #include <asm/nmi.h> | ||
56 | #include <asm/msr.h> | ||
57 | #include <asm/mwait.h> | ||
58 | #include <asm/cpu_device_id.h> | ||
59 | #include <asm/hardirq.h> | ||
60 | |||
#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
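/*
 * The wall-clock length of the default depends on CONFIG_HZ: e.g. at
 * HZ=1000 the 6-jiffy default is 6 ms, at HZ=250 it is 24 ms.
 */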
72 | |||
73 | static unsigned int target_mwait; | ||
74 | static struct dentry *debug_dir; | ||
75 | |||
76 | /* user selected target */ | ||
77 | static unsigned int set_target_ratio; | ||
78 | static unsigned int current_ratio; | ||
79 | static bool should_skip; | ||
80 | static bool reduce_irq; | ||
81 | static atomic_t idle_wakeup_counter; | ||
82 | static unsigned int control_cpu; /* The cpu assigned to collect stat and update | ||
83 | * control parameters. default to BSP but BSP | ||
84 | * can be offlined. | ||
85 | */ | ||
86 | static bool clamping; | ||
87 | |||
static const struct sched_param sparam = {
	.sched_priority = MAX_USER_RT_PRIO / 2,
};
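/*
 * sparam above runs the injection kthreads as SCHED_FIFO at priority 50
 * (MAX_USER_RT_PRIO is 100): high enough to preempt normal tasks when
 * idle must be injected, while leaving headroom for critical RT work.
 */
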
struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask; /* bitmap for tracking the per-cpu
					  * clamping kthread workers
					  */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;
113 | |||
114 | static int duration_set(const char *arg, const struct kernel_param *kp) | ||
115 | { | ||
116 | int ret = 0; | ||
117 | unsigned long new_duration; | ||
118 | |||
119 | ret = kstrtoul(arg, 10, &new_duration); | ||
120 | if (ret) | ||
121 | goto exit; | ||
122 | if (new_duration > 25 || new_duration < 6) { | ||
123 | pr_err("Out of recommended range %lu, between 6-25ms\n", | ||
124 | new_duration); | ||
125 | ret = -EINVAL; | ||
126 | } | ||
127 | |||
128 | duration = clamp(new_duration, 6ul, 25ul); | ||
129 | smp_mb(); | ||
130 | |||
131 | exit: | ||
132 | |||
133 | return ret; | ||
134 | } | ||
135 | |||
136 | static const struct kernel_param_ops duration_ops = { | ||
137 | .set = duration_set, | ||
138 | .get = param_get_int, | ||
139 | }; | ||
140 | |||
141 | |||
142 | module_param_cb(duration, &duration_ops, &duration, 0644); | ||
143 | MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec."); | ||
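/*
 * With mode 0644 the parameter is also writable at runtime, e.g.:
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 */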
144 | |||
145 | struct powerclamp_calibration_data { | ||
146 | unsigned long confidence; /* used for calibration, basically a counter | ||
147 | * gets incremented each time a clamping | ||
148 | * period is completed without extra wakeups | ||
149 | * once that counter is reached given level, | ||
150 | * compensation is deemed usable. | ||
151 | */ | ||
152 | unsigned long steady_comp; /* steady state compensation used when | ||
153 | * no extra wakeups occurred. | ||
154 | */ | ||
155 | unsigned long dynamic_comp; /* compensate excessive wakeup from idle | ||
156 | * mostly from external interrupts. | ||
157 | */ | ||
158 | }; | ||
159 | |||
160 | static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO]; | ||
161 | |||
162 | static int window_size_set(const char *arg, const struct kernel_param *kp) | ||
163 | { | ||
164 | int ret = 0; | ||
165 | unsigned long new_window_size; | ||
166 | |||
167 | ret = kstrtoul(arg, 10, &new_window_size); | ||
168 | if (ret) | ||
169 | goto exit_win; | ||
170 | if (new_window_size > 10 || new_window_size < 2) { | ||
171 | pr_err("Out of recommended window size %lu, between 2-10\n", | ||
172 | new_window_size); | ||
173 | ret = -EINVAL; | ||
174 | } | ||
175 | |||
176 | window_size = clamp(new_window_size, 2ul, 10ul); | ||
177 | smp_mb(); | ||
178 | |||
179 | exit_win: | ||
180 | |||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | static const struct kernel_param_ops window_size_ops = { | ||
185 | .set = window_size_set, | ||
186 | .get = param_get_int, | ||
187 | }; | ||
188 | |||
189 | module_param_cb(window_size, &window_size_ops, &window_size, 0644); | ||
190 | MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n" | ||
191 | "\tpowerclamp controls idle ratio within this window. larger\n" | ||
192 | "\twindow size results in slower response time but more smooth\n" | ||
193 | "\tclamping results. default to 2."); | ||
194 | |||
195 | static void find_target_mwait(void) | ||
196 | { | ||
197 | unsigned int eax, ebx, ecx, edx; | ||
198 | unsigned int highest_cstate = 0; | ||
199 | unsigned int highest_subcstate = 0; | ||
200 | int i; | ||
201 | |||
202 | if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) | ||
203 | return; | ||
204 | |||
205 | cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); | ||
206 | |||
207 | if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || | ||
208 | !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) | ||
209 | return; | ||
210 | |||
	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
}
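/*
 * For example, PKG_CSTATE_INIT(6) expands to
 * { .msr_index = MSR_PKG_C6_RESIDENCY, .cstate_id = 6 }.
 */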
233 | |||
234 | static struct pkg_cstate_info pkg_cstates[] = { | ||
235 | PKG_CSTATE_INIT(2), | ||
236 | PKG_CSTATE_INIT(3), | ||
237 | PKG_CSTATE_INIT(6), | ||
238 | PKG_CSTATE_INIT(7), | ||
239 | PKG_CSTATE_INIT(8), | ||
240 | PKG_CSTATE_INIT(9), | ||
241 | PKG_CSTATE_INIT(10), | ||
242 | {NULL}, | ||
243 | }; | ||
244 | |||
245 | static bool has_pkg_state_counter(void) | ||
246 | { | ||
247 | u64 val; | ||
248 | struct pkg_cstate_info *info = pkg_cstates; | ||
249 | |||
250 | /* check if any one of the counter msrs exists */ | ||
251 | while (info->msr_index) { | ||
252 | if (!rdmsrl_safe(info->msr_index, &val)) | ||
253 | return true; | ||
254 | info++; | ||
255 | } | ||
256 | |||
257 | return false; | ||
258 | } | ||
259 | |||
260 | static u64 pkg_state_counter(void) | ||
261 | { | ||
262 | u64 val; | ||
263 | u64 count = 0; | ||
264 | struct pkg_cstate_info *info = pkg_cstates; | ||
265 | |||
266 | while (info->msr_index) { | ||
267 | if (!info->skip) { | ||
268 | if (!rdmsrl_safe(info->msr_index, &val)) | ||
269 | count += val; | ||
270 | else | ||
271 | info->skip = true; | ||
272 | } | ||
273 | info++; | ||
274 | } | ||
275 | |||
276 | return count; | ||
277 | } | ||
278 | |||
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
315 | |||
316 | static void adjust_compensation(int target_ratio, unsigned int win) | ||
317 | { | ||
318 | int delta; | ||
319 | struct powerclamp_calibration_data *d = &cal_data[target_ratio]; | ||
320 | |||
321 | /* | ||
322 | * adjust compensations if confidence level has not been reached or | ||
323 | * there are too many wakeups during the last idle injection period, we | ||
324 | * cannot trust the data for compensation. | ||
325 | */ | ||
326 | if (d->confidence >= CONFIDENCE_OK || | ||
327 | atomic_read(&idle_wakeup_counter) > | ||
328 | win * num_online_cpus()) | ||
329 | return; | ||
330 | |||
331 | delta = set_target_ratio - current_ratio; | ||
332 | /* filter out bad data */ | ||
333 | if (delta >= 0 && delta <= (1+target_ratio/10)) { | ||
334 | if (d->steady_comp) | ||
335 | d->steady_comp = | ||
336 | roundup(delta+d->steady_comp, 2)/2; | ||
337 | else | ||
338 | d->steady_comp = delta; | ||
339 | d->confidence++; | ||
340 | } | ||
341 | } | ||
342 | |||
343 | static bool powerclamp_adjust_controls(unsigned int target_ratio, | ||
344 | unsigned int guard, unsigned int win) | ||
345 | { | ||
346 | static u64 msr_last, tsc_last; | ||
347 | u64 msr_now, tsc_now; | ||
348 | u64 val64; | ||
349 | |||
350 | /* check result for the last window */ | ||
351 | msr_now = pkg_state_counter(); | ||
352 | tsc_now = rdtsc(); | ||
353 | |||
354 | /* calculate pkg cstate vs tsc ratio */ | ||
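	/*
	 * The package C-state residency counters advance at the TSC rate
	 * (the calculation below relies on this), so
	 * 100 * delta_residency / delta_tsc is the percentage of the last
	 * window the package spent in a C-state.
	 */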
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set the flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}
379 | |||
380 | static void clamp_balancing_func(struct kthread_work *work) | ||
381 | { | ||
382 | struct powerclamp_worker_data *w_data; | ||
383 | int sleeptime; | ||
384 | unsigned long target_jiffies; | ||
385 | unsigned int compensated_ratio; | ||
386 | int interval; /* jiffies to sleep for each attempt */ | ||
387 | |||
388 | w_data = container_of(work, struct powerclamp_worker_data, | ||
389 | balancing_work); | ||
390 | |||
391 | /* | ||
392 | * make sure user selected ratio does not take effect until | ||
393 | * the next round. adjust target_ratio if user has changed | ||
394 | * target such that we can converge quickly. | ||
395 | */ | ||
396 | w_data->target_ratio = READ_ONCE(set_target_ratio); | ||
397 | w_data->guard = 1 + w_data->target_ratio / 20; | ||
398 | w_data->window_size_now = window_size; | ||
399 | w_data->duration_jiffies = msecs_to_jiffies(duration); | ||
400 | w_data->count++; | ||
401 | |||
402 | /* | ||
403 | * systems may have different ability to enter package level | ||
404 | * c-states, thus we need to compensate the injected idle ratio | ||
405 | * to achieve the actual target reported by the HW. | ||
406 | */ | ||
407 | compensated_ratio = w_data->target_ratio + | ||
408 | get_compensation(w_data->target_ratio); | ||
409 | if (compensated_ratio <= 0) | ||
410 | compensated_ratio = 1; | ||
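	/*
	 * The period is sized so that duration_jiffies of idle yields the
	 * compensated ratio; e.g. a 6-jiffy injection at a compensated
	 * ratio of 25 gives an interval of 6 * 100 / 25 = 24 jiffies,
	 * i.e. 25% forced idle.
	 */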
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_msecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}
454 | |||
455 | /* | ||
456 | * 1 HZ polling while clamping is active, useful for userspace | ||
457 | * to monitor actual idle ratio. | ||
458 | */ | ||
459 | static void poll_pkg_cstate(struct work_struct *dummy); | ||
460 | static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate); | ||
461 | static void poll_pkg_cstate(struct work_struct *dummy) | ||
462 | { | ||
463 | static u64 msr_last; | ||
464 | static u64 tsc_last; | ||
465 | |||
466 | u64 msr_now; | ||
467 | u64 tsc_now; | ||
468 | u64 val64; | ||
469 | |||
470 | msr_now = pkg_state_counter(); | ||
471 | tsc_now = rdtsc(); | ||
472 | |||
473 | /* calculate pkg cstate vs tsc ratio */ | ||
474 | if (!msr_last || !tsc_last) | ||
475 | pkg_cstate_ratio_cur = 1; | ||
476 | else { | ||
477 | if (tsc_now - tsc_last) { | ||
478 | val64 = 100 * (msr_now - msr_last); | ||
479 | do_div(val64, (tsc_now - tsc_last)); | ||
480 | pkg_cstate_ratio_cur = val64; | ||
481 | } | ||
482 | } | ||
483 | |||
484 | /* update record */ | ||
485 | msr_last = msr_now; | ||
486 | tsc_last = tsc_now; | ||
487 | |||
488 | if (true == clamping) | ||
489 | schedule_delayed_work(&poll_pkg_cstate_work, HZ); | ||
490 | } | ||
491 | |||
492 | static void start_power_clamp_worker(unsigned long cpu) | ||
493 | { | ||
494 | struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); | ||
495 | struct kthread_worker *worker; | ||
496 | |||
497 | worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); | ||
498 | if (IS_ERR(worker)) | ||
499 | return; | ||
500 | |||
501 | w_data->worker = worker; | ||
502 | w_data->count = 0; | ||
503 | w_data->cpu = cpu; | ||
504 | w_data->clamping = true; | ||
505 | set_bit(cpu, cpu_clamping_mask); | ||
506 | sched_setscheduler(worker->task, SCHED_FIFO, &sparam); | ||
507 | kthread_init_work(&w_data->balancing_work, clamp_balancing_func); | ||
508 | kthread_init_delayed_work(&w_data->idle_injection_work, | ||
509 | clamp_idle_injection_func); | ||
510 | kthread_queue_work(w_data->worker, &w_data->balancing_work); | ||
511 | } | ||
512 | |||
513 | static void stop_power_clamp_worker(unsigned long cpu) | ||
514 | { | ||
515 | struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); | ||
516 | |||
517 | if (!w_data->worker) | ||
518 | return; | ||
519 | |||
520 | w_data->clamping = false; | ||
521 | /* | ||
522 | * Make sure that all works that get queued after this point see | ||
523 | * the clamping disabled. The counter part is not needed because | ||
524 | * there is an implicit memory barrier when the queued work | ||
525 | * is proceed. | ||
526 | */ | ||
527 | smp_wmb(); | ||
528 | kthread_cancel_work_sync(&w_data->balancing_work); | ||
529 | kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); | ||
530 | /* | ||
531 | * The balancing work still might be queued here because | ||
532 | * the handling of the "clapming" variable, cancel, and queue | ||
533 | * operations are not synchronized via a lock. But it is not | ||
534 | * a big deal. The balancing work is fast and destroy kthread | ||
535 | * will wait for it. | ||
536 | */ | ||
537 | clear_bit(w_data->cpu, cpu_clamping_mask); | ||
538 | kthread_destroy_worker(w_data->worker); | ||
539 | |||
540 | w_data->worker = NULL; | ||
541 | } | ||
542 | |||
543 | static int start_power_clamp(void) | ||
544 | { | ||
545 | unsigned long cpu; | ||
546 | |||
547 | set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); | ||
548 | /* prevent cpu hotplug */ | ||
549 | get_online_cpus(); | ||
550 | |||
551 | /* prefer BSP */ | ||
552 | control_cpu = 0; | ||
553 | if (!cpu_online(control_cpu)) | ||
554 | control_cpu = smp_processor_id(); | ||
555 | |||
556 | clamping = true; | ||
557 | schedule_delayed_work(&poll_pkg_cstate_work, 0); | ||
558 | |||
559 | /* start one kthread worker per online cpu */ | ||
560 | for_each_online_cpu(cpu) { | ||
561 | start_power_clamp_worker(cpu); | ||
562 | } | ||
563 | put_online_cpus(); | ||
564 | |||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | static void end_power_clamp(void) | ||
569 | { | ||
570 | int i; | ||
571 | |||
572 | /* | ||
573 | * Block requeuing in all the kthread workers. They will flush and | ||
574 | * stop faster. | ||
575 | */ | ||
576 | clamping = false; | ||
577 | if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { | ||
578 | for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { | ||
579 | pr_debug("clamping worker for cpu %d alive, destroy\n", | ||
580 | i); | ||
581 | stop_power_clamp_worker(i); | ||
582 | } | ||
583 | } | ||
584 | } | ||
585 | |||
586 | static int powerclamp_cpu_online(unsigned int cpu) | ||
587 | { | ||
588 | if (clamping == false) | ||
589 | return 0; | ||
590 | start_power_clamp_worker(cpu); | ||
591 | /* prefer BSP as controlling CPU */ | ||
592 | if (cpu == 0) { | ||
593 | control_cpu = 0; | ||
594 | smp_mb(); | ||
595 | } | ||
596 | return 0; | ||
597 | } | ||
598 | |||
599 | static int powerclamp_cpu_predown(unsigned int cpu) | ||
600 | { | ||
601 | if (clamping == false) | ||
602 | return 0; | ||
603 | |||
604 | stop_power_clamp_worker(cpu); | ||
605 | if (cpu != control_cpu) | ||
606 | return 0; | ||
607 | |||
608 | control_cpu = cpumask_first(cpu_online_mask); | ||
609 | if (control_cpu == cpu) | ||
610 | control_cpu = cpumask_next(cpu, cpu_online_mask); | ||
611 | smp_mb(); | ||
612 | return 0; | ||
613 | } | ||
614 | |||
615 | static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, | ||
616 | unsigned long *state) | ||
617 | { | ||
618 | *state = MAX_TARGET_RATIO; | ||
619 | |||
620 | return 0; | ||
621 | } | ||
622 | |||
623 | static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev, | ||
624 | unsigned long *state) | ||
625 | { | ||
626 | if (true == clamping) | ||
627 | *state = pkg_cstate_ratio_cur; | ||
628 | else | ||
629 | /* to save power, do not poll idle ratio while not clamping */ | ||
630 | *state = -1; /* indicates invalid state */ | ||
631 | |||
632 | return 0; | ||
633 | } | ||
634 | |||
635 | static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev, | ||
636 | unsigned long new_target_ratio) | ||
637 | { | ||
638 | int ret = 0; | ||
639 | |||
640 | new_target_ratio = clamp(new_target_ratio, 0UL, | ||
641 | (unsigned long) (MAX_TARGET_RATIO-1)); | ||
642 | if (set_target_ratio == 0 && new_target_ratio > 0) { | ||
643 | pr_info("Start idle injection to reduce power\n"); | ||
644 | set_target_ratio = new_target_ratio; | ||
645 | ret = start_power_clamp(); | ||
646 | goto exit_set; | ||
647 | } else if (set_target_ratio > 0 && new_target_ratio == 0) { | ||
648 | pr_info("Stop forced idle injection\n"); | ||
649 | end_power_clamp(); | ||
650 | set_target_ratio = 0; | ||
651 | } else /* adjust currently running */ { | ||
652 | set_target_ratio = new_target_ratio; | ||
653 | /* make new set_target_ratio visible to other cpus */ | ||
654 | smp_mb(); | ||
655 | } | ||
656 | |||
657 | exit_set: | ||
658 | return ret; | ||
659 | } | ||
660 | |||
661 | /* bind to generic thermal layer as cooling device*/ | ||
662 | static struct thermal_cooling_device_ops powerclamp_cooling_ops = { | ||
663 | .get_max_state = powerclamp_get_max_state, | ||
664 | .get_cur_state = powerclamp_get_cur_state, | ||
665 | .set_cur_state = powerclamp_set_cur_state, | ||
666 | }; | ||
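/*
 * Once registered, the thermal core exposes this device under
 * /sys/class/thermal/cooling_deviceX/. Writing a nonzero idle percentage
 * to cur_state starts idle injection; writing 0 stops it.
 */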
667 | |||
668 | static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { | ||
669 | { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT }, | ||
670 | {} | ||
671 | }; | ||
672 | MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); | ||
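/*
 * Matching only on X86_FEATURE_MWAIT (vendor Intel, any family/model)
 * lets the module be autoloaded on any Intel CPU advertising MWAIT.
 */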
673 | |||
674 | static int __init powerclamp_probe(void) | ||
675 | { | ||
676 | |||
677 | if (!x86_match_cpu(intel_powerclamp_ids)) { | ||
678 | pr_err("CPU does not support MWAIT\n"); | ||
679 | return -ENODEV; | ||
680 | } | ||
681 | |||
682 | /* The goal for idle time alignment is to achieve package cstate. */ | ||
683 | if (!has_pkg_state_counter()) { | ||
684 | pr_info("No package C-state available\n"); | ||
685 | return -ENODEV; | ||
686 | } | ||
687 | |||
688 | /* find the deepest mwait value */ | ||
689 | find_target_mwait(); | ||
690 | |||
691 | return 0; | ||
692 | } | ||
693 | |||
694 | static int powerclamp_debug_show(struct seq_file *m, void *unused) | ||
695 | { | ||
696 | int i = 0; | ||
697 | |||
698 | seq_printf(m, "controlling cpu: %d\n", control_cpu); | ||
699 | seq_printf(m, "pct confidence steady dynamic (compensation)\n"); | ||
700 | for (i = 0; i < MAX_TARGET_RATIO; i++) { | ||
701 | seq_printf(m, "%d\t%lu\t%lu\t%lu\n", | ||
702 | i, | ||
703 | cal_data[i].confidence, | ||
704 | cal_data[i].steady_comp, | ||
705 | cal_data[i].dynamic_comp); | ||
706 | } | ||
707 | |||
708 | return 0; | ||
709 | } | ||
710 | |||
711 | static int powerclamp_debug_open(struct inode *inode, | ||
712 | struct file *file) | ||
713 | { | ||
714 | return single_open(file, powerclamp_debug_show, inode->i_private); | ||
715 | } | ||
716 | |||
717 | static const struct file_operations powerclamp_debug_fops = { | ||
718 | .open = powerclamp_debug_open, | ||
719 | .read = seq_read, | ||
720 | .llseek = seq_lseek, | ||
721 | .release = single_release, | ||
722 | .owner = THIS_MODULE, | ||
723 | }; | ||
724 | |||
725 | static inline void powerclamp_create_debug_files(void) | ||
726 | { | ||
727 | debug_dir = debugfs_create_dir("intel_powerclamp", NULL); | ||
728 | if (!debug_dir) | ||
729 | return; | ||
730 | |||
731 | if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, | ||
732 | cal_data, &powerclamp_debug_fops)) | ||
733 | goto file_error; | ||
734 | |||
735 | return; | ||
736 | |||
737 | file_error: | ||
738 | debugfs_remove_recursive(debug_dir); | ||
739 | } | ||
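/*
 * With debugfs mounted at the usual location, the calibration table can
 * be dumped with:
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 */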
740 | |||
741 | static enum cpuhp_state hp_state; | ||
742 | |||
743 | static int __init powerclamp_init(void) | ||
744 | { | ||
745 | int retval; | ||
746 | int bitmap_size; | ||
747 | |||
748 | bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long); | ||
749 | cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL); | ||
750 | if (!cpu_clamping_mask) | ||
751 | return -ENOMEM; | ||
752 | |||
753 | /* probe cpu features and ids here */ | ||
754 | retval = powerclamp_probe(); | ||
755 | if (retval) | ||
756 | goto exit_free; | ||
757 | |||
758 | /* set default limit, maybe adjusted during runtime based on feedback */ | ||
759 | window_size = 2; | ||
760 | retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, | ||
761 | "thermal/intel_powerclamp:online", | ||
762 | powerclamp_cpu_online, | ||
763 | powerclamp_cpu_predown); | ||
764 | if (retval < 0) | ||
765 | goto exit_free; | ||
766 | |||
767 | hp_state = retval; | ||
768 | |||
769 | worker_data = alloc_percpu(struct powerclamp_worker_data); | ||
770 | if (!worker_data) { | ||
771 | retval = -ENOMEM; | ||
772 | goto exit_unregister; | ||
773 | } | ||
774 | |||
775 | cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL, | ||
776 | &powerclamp_cooling_ops); | ||
777 | if (IS_ERR(cooling_dev)) { | ||
778 | retval = -ENODEV; | ||
779 | goto exit_free_thread; | ||
780 | } | ||
781 | |||
782 | if (!duration) | ||
783 | duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES); | ||
784 | |||
785 | powerclamp_create_debug_files(); | ||
786 | |||
787 | return 0; | ||
788 | |||
789 | exit_free_thread: | ||
790 | free_percpu(worker_data); | ||
791 | exit_unregister: | ||
792 | cpuhp_remove_state_nocalls(hp_state); | ||
793 | exit_free: | ||
794 | kfree(cpu_clamping_mask); | ||
795 | return retval; | ||
796 | } | ||
797 | module_init(powerclamp_init); | ||
798 | |||
799 | static void __exit powerclamp_exit(void) | ||
800 | { | ||
801 | end_power_clamp(); | ||
802 | cpuhp_remove_state_nocalls(hp_state); | ||
803 | free_percpu(worker_data); | ||
804 | thermal_cooling_device_unregister(cooling_dev); | ||
805 | kfree(cpu_clamping_mask); | ||
806 | |||
807 | cancel_delayed_work_sync(&poll_pkg_cstate_work); | ||
808 | debugfs_remove_recursive(debug_dir); | ||
809 | } | ||
810 | module_exit(powerclamp_exit); | ||
811 | |||
812 | MODULE_LICENSE("GPL"); | ||
813 | MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); | ||
814 | MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>"); | ||
815 | MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs"); | ||