aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/thermal/x86_pkg_temperature_thermal47
-rw-r--r--arch/x86/include/asm/mce.h7
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c63
-rw-r--r--drivers/thermal/Kconfig13
-rw-r--r--drivers/thermal/Makefile1
-rw-r--r--drivers/thermal/x86_pkg_temp_thermal.c642
6 files changed, 769 insertions, 4 deletions
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal b/Documentation/thermal/x86_pkg_temperature_thermal
new file mode 100644
index 000000000000..17a3a4c0a0ca
--- /dev/null
+++ b/Documentation/thermal/x86_pkg_temperature_thermal
@@ -0,0 +1,47 @@
1Kernel driver: x86_pkg_temp_thermal
2===================
3
4Supported chips:
5* x86: with package level thermal management
6(Verify using: CPUID.06H:EAX[bit 6] =1)
7
8Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
9
10Reference
11---
12Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
13Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
14
15Description
16---------
17
18This driver registers the CPU digital temperature package level sensor as a
19thermal zone with a maximum of two user mode configurable trip points. The number
20of trip points depends on the capability of the package. Once a trip point is
21violated, user mode can receive a notification via the thermal notification
22mechanism and can take any action to control the temperature.
23
24
25Threshold management
26--------------------
27Each package will register as a thermal zone under /sys/class/thermal.
28Example:
29/sys/class/thermal/thermal_zone1
30
31This contains two trip points:
32- trip_point_0_temp
33- trip_point_1_temp
34
35The user can set any temperature between 0 and the TJ-Max temperature. Temperature
36units are in milli-degree Celsius. Refer to "Documentation/thermal/sysfs-api.txt" for
37thermal sys-fs details.
38
39Any value other than 0 in these trip points can trigger thermal notifications.
40Setting a trip point to 0 stops thermal notifications for that trip point.
41
42Thermal notifications: To get kobject-uevent notifications, set the thermal zone
43policy to "user_space". For example: echo -n "user_space" > policy
44
45
46
47
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fa5f71e021d5..16a214557a58 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -214,6 +214,13 @@ void mce_log_therm_throt_event(__u64 status);
214/* Interrupt Handler for core thermal thresholds */ 214/* Interrupt Handler for core thermal thresholds */
215extern int (*platform_thermal_notify)(__u64 msr_val); 215extern int (*platform_thermal_notify)(__u64 msr_val);
216 216
217/* Interrupt Handler for package thermal thresholds */
218extern int (*platform_thermal_package_notify)(__u64 msr_val);
219
220/* Callback support of rate control, return true, if
221 * callback has rate control */
222extern bool (*platform_thermal_package_rate_control)(void);
223
217#ifdef CONFIG_X86_THERMAL_VECTOR 224#ifdef CONFIG_X86_THERMAL_VECTOR
218extern void mcheck_intel_therm_init(void); 225extern void mcheck_intel_therm_init(void);
219#else 226#else
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 47a1870279aa..4131c0393594 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -54,12 +54,24 @@ struct thermal_state {
54 struct _thermal_state package_power_limit; 54 struct _thermal_state package_power_limit;
55 struct _thermal_state core_thresh0; 55 struct _thermal_state core_thresh0;
56 struct _thermal_state core_thresh1; 56 struct _thermal_state core_thresh1;
57 struct _thermal_state pkg_thresh0;
58 struct _thermal_state pkg_thresh1;
57}; 59};
58 60
59/* Callback to handle core threshold interrupts */ 61/* Callback to handle core threshold interrupts */
60int (*platform_thermal_notify)(__u64 msr_val); 62int (*platform_thermal_notify)(__u64 msr_val);
61EXPORT_SYMBOL(platform_thermal_notify); 63EXPORT_SYMBOL(platform_thermal_notify);
62 64
65/* Callback to handle package threshold interrupts */
66int (*platform_thermal_package_notify)(__u64 msr_val);
67EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
68
69/* Callback support of rate control, return true, if
70 * callback has rate control */
71bool (*platform_thermal_package_rate_control)(void);
72EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
73
74
63static DEFINE_PER_CPU(struct thermal_state, thermal_state); 75static DEFINE_PER_CPU(struct thermal_state, thermal_state);
64 76
65static atomic_t therm_throt_en = ATOMIC_INIT(0); 77static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -203,19 +215,25 @@ static int therm_throt_process(bool new_event, int event, int level)
203 return 0; 215 return 0;
204} 216}
205 217
206static int thresh_event_valid(int event) 218static int thresh_event_valid(int level, int event)
207{ 219{
208 struct _thermal_state *state; 220 struct _thermal_state *state;
209 unsigned int this_cpu = smp_processor_id(); 221 unsigned int this_cpu = smp_processor_id();
210 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); 222 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
211 u64 now = get_jiffies_64(); 223 u64 now = get_jiffies_64();
212 224
213 state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; 225 if (level == PACKAGE_LEVEL)
226 state = (event == 0) ? &pstate->pkg_thresh0 :
227 &pstate->pkg_thresh1;
228 else
229 state = (event == 0) ? &pstate->core_thresh0 :
230 &pstate->core_thresh1;
214 231
215 if (time_before64(now, state->next_check)) 232 if (time_before64(now, state->next_check))
216 return 0; 233 return 0;
217 234
218 state->next_check = now + CHECK_INTERVAL; 235 state->next_check = now + CHECK_INTERVAL;
236
219 return 1; 237 return 1;
220} 238}
221 239
@@ -321,6 +339,39 @@ device_initcall(thermal_throttle_init_device);
321 339
322#endif /* CONFIG_SYSFS */ 340#endif /* CONFIG_SYSFS */
323 341
342static void notify_package_thresholds(__u64 msr_val)
343{
344 bool notify_thres_0 = false;
345 bool notify_thres_1 = false;
346
347 if (!platform_thermal_package_notify)
348 return;
349
350 /* lower threshold check */
351 if (msr_val & THERM_LOG_THRESHOLD0)
352 notify_thres_0 = true;
353 /* higher threshold check */
354 if (msr_val & THERM_LOG_THRESHOLD1)
355 notify_thres_1 = true;
356
357 if (!notify_thres_0 && !notify_thres_1)
358 return;
359
360 if (platform_thermal_package_rate_control &&
361 platform_thermal_package_rate_control()) {
362 /* Rate control is implemented in callback */
363 platform_thermal_package_notify(msr_val);
364 return;
365 }
366
367 /* lower threshold reached */
368 if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
369 platform_thermal_package_notify(msr_val);
370 /* higher threshold reached */
371 if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
372 platform_thermal_package_notify(msr_val);
373}
374
324static void notify_thresholds(__u64 msr_val) 375static void notify_thresholds(__u64 msr_val)
325{ 376{
326 /* check whether the interrupt handler is defined; 377 /* check whether the interrupt handler is defined;
@@ -330,10 +381,12 @@ static void notify_thresholds(__u64 msr_val)
330 return; 381 return;
331 382
332 /* lower threshold reached */ 383 /* lower threshold reached */
333 if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) 384 if ((msr_val & THERM_LOG_THRESHOLD0) &&
385 thresh_event_valid(CORE_LEVEL, 0))
334 platform_thermal_notify(msr_val); 386 platform_thermal_notify(msr_val);
335 /* higher threshold reached */ 387 /* higher threshold reached */
336 if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) 388 if ((msr_val & THERM_LOG_THRESHOLD1) &&
389 thresh_event_valid(CORE_LEVEL, 1))
337 platform_thermal_notify(msr_val); 390 platform_thermal_notify(msr_val);
338} 391}
339 392
@@ -359,6 +412,8 @@ static void intel_thermal_interrupt(void)
359 412
360 if (this_cpu_has(X86_FEATURE_PTS)) { 413 if (this_cpu_has(X86_FEATURE_PTS)) {
361 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 414 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
415 /* check violations of package thermal thresholds */
416 notify_package_thresholds(msr_val);
362 therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 417 therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
363 THERMAL_THROTTLING_EVENT, 418 THERMAL_THROTTLING_EVENT,
364 PACKAGE_LEVEL); 419 PACKAGE_LEVEL);
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 7205c70a46a3..b13c2bcccb72 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -169,7 +169,20 @@ config INTEL_POWERCLAMP
169 enforce idle time which results in more package C-state residency. The 169 enforce idle time which results in more package C-state residency. The
170 user interface is exposed via generic thermal framework. 170 user interface is exposed via generic thermal framework.
171 171
172config X86_PKG_TEMP_THERMAL
173 tristate "X86 package temperature thermal driver"
174 depends on THERMAL
175 depends on X86
176 select THERMAL_GOV_USER_SPACE
177 default m
178 help
179 Enable this to register CPU digital sensor for package temperature as
180 thermal zone. Each package will have its own thermal zone. There are
181 two trip points which can be set by user to get notifications via thermal
182 notification methods.
183
172menu "Texas Instruments thermal drivers" 184menu "Texas Instruments thermal drivers"
173source "drivers/thermal/ti-soc-thermal/Kconfig" 185source "drivers/thermal/ti-soc-thermal/Kconfig"
174endmenu 186endmenu
187
175endif 188endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 85693941fda0..67184a293e3f 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -23,4 +23,5 @@ obj-$(CONFIG_DB8500_THERMAL) += db8500_thermal.o
23obj-$(CONFIG_ARMADA_THERMAL) += armada_thermal.o 23obj-$(CONFIG_ARMADA_THERMAL) += armada_thermal.o
24obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o 24obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o
25obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o 25obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o
26obj-$(CONFIG_X86_PKG_TEMP_THERMAL) += x86_pkg_temp_thermal.o
26obj-$(CONFIG_TI_SOC_THERMAL) += ti-soc-thermal/ 27obj-$(CONFIG_TI_SOC_THERMAL) += ti-soc-thermal/
diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
new file mode 100644
index 000000000000..5de56f671a9d
--- /dev/null
+++ b/drivers/thermal/x86_pkg_temp_thermal.c
@@ -0,0 +1,642 @@
1/*
2 * x86_pkg_temp_thermal driver
3 * Copyright (c) 2013, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.
16 *
17 */
18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/err.h>
23#include <linux/param.h>
24#include <linux/device.h>
25#include <linux/platform_device.h>
26#include <linux/cpu.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/pm.h>
30#include <linux/thermal.h>
31#include <linux/debugfs.h>
32#include <asm/cpu_device_id.h>
33#include <asm/mce.h>
34
35/*
36* Rate control delay: Idea is to introduce denounce effect
37* This should be long enough to avoid reduce events, when
38* threshold is set to a temperature, which is constantly
39* violated, but at the short enough to take any action.
40* The action can be remove threshold or change it to next
41* interesting setting. Based on experiments, in around
42* every 5 seconds under load will give us a significant
43* temperature change.
44*/
45#define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000
46static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
47module_param(notify_delay_ms, int, 0644);
48MODULE_PARM_DESC(notify_delay_ms,
49 "User space notification delay in milli seconds.");
50
/*
 * Upper bound on the number of trip points in a thermal zone.
 * The MSR only allows setting, and getting notifications for, two
 * thresholds, so the count cannot exceed 2. This define enforces the limit
 * in case cpuid returns a wrong value for the number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2
57
58struct phy_dev_entry {
59 struct list_head list;
60 u16 phys_proc_id;
61 u16 first_cpu;
62 u32 tj_max;
63 int ref_cnt;
64 u32 start_pkg_therm_low;
65 u32 start_pkg_therm_high;
66 struct thermal_zone_device *tzone;
67};
68
69/* List maintaining number of package instances */
70static LIST_HEAD(phy_dev_list);
71static DEFINE_MUTEX(phy_dev_list_mutex);
72
73/* Interrupt to work function schedule queue */
74static DEFINE_PER_CPU(struct delayed_work, pkg_temp_thermal_threshold_work);
75
76/* To track if the work is already scheduled on a package */
77static u8 *pkg_work_scheduled;
78
79/* Spin lock to prevent races with pkg_work_scheduled */
80static spinlock_t pkg_work_lock;
81static u16 max_phy_id;
82
83/* Debug counters to show using debugfs */
84static struct dentry *debugfs;
85static unsigned int pkg_interrupt_cnt;
86static unsigned int pkg_work_cnt;
87
88static int pkg_temp_debugfs_init(void)
89{
90 struct dentry *d;
91
92 debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
93 if (!debugfs)
94 return -ENOENT;
95
96 d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
97 (u32 *)&pkg_interrupt_cnt);
98 if (!d)
99 goto err_out;
100
101 d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
102 (u32 *)&pkg_work_cnt);
103 if (!d)
104 goto err_out;
105
106 return 0;
107
108err_out:
109 debugfs_remove_recursive(debugfs);
110 return -ENOENT;
111}
112
113static struct phy_dev_entry
114 *pkg_temp_thermal_get_phy_entry(unsigned int cpu)
115{
116 u16 phys_proc_id = topology_physical_package_id(cpu);
117 struct phy_dev_entry *phy_ptr;
118
119 mutex_lock(&phy_dev_list_mutex);
120
121 list_for_each_entry(phy_ptr, &phy_dev_list, list)
122 if (phy_ptr->phys_proc_id == phys_proc_id) {
123 mutex_unlock(&phy_dev_list_mutex);
124 return phy_ptr;
125 }
126
127 mutex_unlock(&phy_dev_list_mutex);
128
129 return NULL;
130}
131
132/*
133* tj-max is is interesting because threshold is set relative to this
134* temperature.
135*/
136static int get_tj_max(int cpu, u32 *tj_max)
137{
138 u32 eax, edx;
139 u32 val;
140 int err;
141
142 err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
143 if (err)
144 goto err_ret;
145 else {
146 val = (eax >> 16) & 0xff;
147 if (val)
148 *tj_max = val * 1000;
149 else {
150 err = -EINVAL;
151 goto err_ret;
152 }
153 }
154
155 return 0;
156err_ret:
157 *tj_max = 0;
158 return err;
159}
160
161static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
162{
163 u32 eax, edx;
164 struct phy_dev_entry *phy_dev_entry;
165
166 phy_dev_entry = tzd->devdata;
167 rdmsr_on_cpu(phy_dev_entry->first_cpu, MSR_IA32_PACKAGE_THERM_STATUS,
168 &eax, &edx);
169 if (eax & 0x80000000) {
170 *temp = phy_dev_entry->tj_max -
171 ((eax >> 16) & 0x7f) * 1000;
172 pr_debug("sys_get_curr_temp %ld\n", *temp);
173 return 0;
174 }
175
176 return -EINVAL;
177}
178
179static int sys_get_trip_temp(struct thermal_zone_device *tzd,
180 int trip, unsigned long *temp)
181{
182 u32 eax, edx;
183 struct phy_dev_entry *phy_dev_entry;
184 u32 mask, shift;
185 unsigned long thres_reg_value;
186 int ret;
187
188 if (trip >= MAX_NUMBER_OF_TRIPS)
189 return -EINVAL;
190
191 phy_dev_entry = tzd->devdata;
192
193 if (trip) {
194 mask = THERM_MASK_THRESHOLD1;
195 shift = THERM_SHIFT_THRESHOLD1;
196 } else {
197 mask = THERM_MASK_THRESHOLD0;
198 shift = THERM_SHIFT_THRESHOLD0;
199 }
200
201 ret = rdmsr_on_cpu(phy_dev_entry->first_cpu,
202 MSR_IA32_PACKAGE_THERM_INTERRUPT, &eax, &edx);
203 if (ret < 0)
204 return -EINVAL;
205
206 thres_reg_value = (eax & mask) >> shift;
207 if (thres_reg_value)
208 *temp = phy_dev_entry->tj_max - thres_reg_value * 1000;
209 else
210 *temp = 0;
211 pr_debug("sys_get_trip_temp %ld\n", *temp);
212
213 return 0;
214}
215
216int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
217 unsigned long temp)
218{
219 u32 l, h;
220 struct phy_dev_entry *phy_dev_entry;
221 u32 mask, shift, intr;
222 int ret;
223
224 phy_dev_entry = tzd->devdata;
225
226 if (trip >= MAX_NUMBER_OF_TRIPS || temp >= phy_dev_entry->tj_max)
227 return -EINVAL;
228
229 ret = rdmsr_on_cpu(phy_dev_entry->first_cpu,
230 MSR_IA32_PACKAGE_THERM_INTERRUPT,
231 &l, &h);
232 if (ret < 0)
233 return -EINVAL;
234
235 if (trip) {
236 mask = THERM_MASK_THRESHOLD1;
237 shift = THERM_SHIFT_THRESHOLD1;
238 intr = THERM_INT_THRESHOLD1_ENABLE;
239 } else {
240 mask = THERM_MASK_THRESHOLD0;
241 shift = THERM_SHIFT_THRESHOLD0;
242 intr = THERM_INT_THRESHOLD0_ENABLE;
243 }
244 l &= ~mask;
245 /*
246 * When users space sets a trip temperature == 0, which is indication
247 * that, it is no longer interested in receiving notifications.
248 */
249 if (!temp)
250 l &= ~intr;
251 else {
252 l |= (phy_dev_entry->tj_max - temp)/1000 << shift;
253 l |= intr;
254 }
255
256 return wrmsr_on_cpu(phy_dev_entry->first_cpu,
257 MSR_IA32_PACKAGE_THERM_INTERRUPT,
258 l, h);
259}
260
261static int sys_get_trip_type(struct thermal_zone_device *thermal,
262 int trip, enum thermal_trip_type *type)
263{
264
265 *type = THERMAL_TRIP_PASSIVE;
266
267 return 0;
268}
269
270/* Thermal zone callback registry */
271static struct thermal_zone_device_ops tzone_ops = {
272 .get_temp = sys_get_curr_temp,
273 .get_trip_temp = sys_get_trip_temp,
274 .get_trip_type = sys_get_trip_type,
275 .set_trip_temp = sys_set_trip_temp,
276};
277
278static bool pkg_temp_thermal_platform_thermal_rate_control(void)
279{
280 return true;
281}
282
283/* Enable threshold interrupt on local package/cpu */
284static inline void enable_pkg_thres_interrupt(void)
285{
286 u32 l, h;
287 u8 thres_0, thres_1;
288
289 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
290 /* only enable/disable if it had valid threshold value */
291 thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
292 thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
293 if (thres_0)
294 l |= THERM_INT_THRESHOLD0_ENABLE;
295 if (thres_1)
296 l |= THERM_INT_THRESHOLD1_ENABLE;
297 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
298}
299
300/* Disable threshold interrupt on local package/cpu */
301static inline void disable_pkg_thres_interrupt(void)
302{
303 u32 l, h;
304 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
305 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
306 l & (~THERM_INT_THRESHOLD0_ENABLE) &
307 (~THERM_INT_THRESHOLD1_ENABLE), h);
308}
309
310static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
311{
312 __u64 msr_val;
313 int cpu = smp_processor_id();
314 int phy_id = topology_physical_package_id(cpu);
315 struct phy_dev_entry *phdev = pkg_temp_thermal_get_phy_entry(cpu);
316 bool notify = false;
317
318 if (!phdev)
319 return;
320
321 spin_lock(&pkg_work_lock);
322 ++pkg_work_cnt;
323 if (unlikely(phy_id > max_phy_id)) {
324 spin_unlock(&pkg_work_lock);
325 return;
326 }
327 pkg_work_scheduled[phy_id] = 0;
328 spin_unlock(&pkg_work_lock);
329
330 enable_pkg_thres_interrupt();
331 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
332 if (msr_val & THERM_LOG_THRESHOLD0) {
333 wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS,
334 msr_val & ~THERM_LOG_THRESHOLD0);
335 notify = true;
336 }
337 if (msr_val & THERM_LOG_THRESHOLD1) {
338 wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS,
339 msr_val & ~THERM_LOG_THRESHOLD1);
340 notify = true;
341 }
342 if (notify) {
343 pr_debug("thermal_zone_device_update\n");
344 thermal_zone_device_update(phdev->tzone);
345 }
346}
347
348static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
349{
350 unsigned long flags;
351 int cpu = smp_processor_id();
352 int phy_id = topology_physical_package_id(cpu);
353
354 /*
355 * When a package is in interrupted state, all CPU's in that package
356 * are in the same interrupt state. So scheduling on any one CPU in
357 * the package is enough and simply return for others.
358 */
359 spin_lock_irqsave(&pkg_work_lock, flags);
360 ++pkg_interrupt_cnt;
361 if (unlikely(phy_id > max_phy_id) || unlikely(!pkg_work_scheduled) ||
362 pkg_work_scheduled[phy_id]) {
363 disable_pkg_thres_interrupt();
364 spin_unlock_irqrestore(&pkg_work_lock, flags);
365 return -EINVAL;
366 }
367 pkg_work_scheduled[phy_id] = 1;
368 spin_unlock_irqrestore(&pkg_work_lock, flags);
369
370 disable_pkg_thres_interrupt();
371 schedule_delayed_work_on(cpu,
372 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
373 msecs_to_jiffies(notify_delay_ms));
374 return 0;
375}
376
377static int find_siblings_cpu(int cpu)
378{
379 int i;
380 int id = topology_physical_package_id(cpu);
381
382 for_each_online_cpu(i)
383 if (i != cpu && topology_physical_package_id(i) == id)
384 return i;
385
386 return 0;
387}
388
389static int pkg_temp_thermal_device_add(unsigned int cpu)
390{
391 int err;
392 u32 tj_max;
393 struct phy_dev_entry *phy_dev_entry;
394 char buffer[30];
395 int thres_count;
396 u32 eax, ebx, ecx, edx;
397
398 cpuid(6, &eax, &ebx, &ecx, &edx);
399 thres_count = ebx & 0x07;
400 if (!thres_count)
401 return -ENODEV;
402
403 thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
404
405 err = get_tj_max(cpu, &tj_max);
406 if (err)
407 goto err_ret;
408
409 mutex_lock(&phy_dev_list_mutex);
410
411 phy_dev_entry = kzalloc(sizeof(*phy_dev_entry), GFP_KERNEL);
412 if (!phy_dev_entry) {
413 err = -ENOMEM;
414 goto err_ret_unlock;
415 }
416
417 spin_lock(&pkg_work_lock);
418 if (topology_physical_package_id(cpu) > max_phy_id)
419 max_phy_id = topology_physical_package_id(cpu);
420 pkg_work_scheduled = krealloc(pkg_work_scheduled,
421 (max_phy_id+1) * sizeof(u8), GFP_ATOMIC);
422 if (!pkg_work_scheduled) {
423 spin_unlock(&pkg_work_lock);
424 err = -ENOMEM;
425 goto err_ret_free;
426 }
427 pkg_work_scheduled[topology_physical_package_id(cpu)] = 0;
428 spin_unlock(&pkg_work_lock);
429
430 phy_dev_entry->phys_proc_id = topology_physical_package_id(cpu);
431 phy_dev_entry->first_cpu = cpu;
432 phy_dev_entry->tj_max = tj_max;
433 phy_dev_entry->ref_cnt = 1;
434 snprintf(buffer, sizeof(buffer), "pkg-temp-%d\n",
435 phy_dev_entry->phys_proc_id);
436 phy_dev_entry->tzone = thermal_zone_device_register(buffer,
437 thres_count,
438 (thres_count == MAX_NUMBER_OF_TRIPS) ?
439 0x03 : 0x01,
440 phy_dev_entry, &tzone_ops, NULL, 0, 0);
441 if (IS_ERR(phy_dev_entry->tzone)) {
442 err = PTR_ERR(phy_dev_entry->tzone);
443 goto err_ret_free;
444 }
445 /* Store MSR value for package thermal interrupt, to restore at exit */
446 rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
447 &phy_dev_entry->start_pkg_therm_low,
448 &phy_dev_entry->start_pkg_therm_high);
449
450 list_add_tail(&phy_dev_entry->list, &phy_dev_list);
451 pr_debug("pkg_temp_thermal_device_add :phy_id %d cpu %d\n",
452 phy_dev_entry->phys_proc_id, cpu);
453
454 mutex_unlock(&phy_dev_list_mutex);
455
456 return 0;
457
458err_ret_free:
459 kfree(phy_dev_entry);
460err_ret_unlock:
461 mutex_unlock(&phy_dev_list_mutex);
462
463err_ret:
464 return err;
465}
466
467static int pkg_temp_thermal_device_remove(unsigned int cpu)
468{
469 struct phy_dev_entry *n;
470 u16 phys_proc_id = topology_physical_package_id(cpu);
471 struct phy_dev_entry *phdev =
472 pkg_temp_thermal_get_phy_entry(cpu);
473
474 if (!phdev)
475 return -ENODEV;
476
477 mutex_lock(&phy_dev_list_mutex);
478 /* If we are loosing the first cpu for this package, we need change */
479 if (phdev->first_cpu == cpu) {
480 phdev->first_cpu = find_siblings_cpu(cpu);
481 pr_debug("thermal_device_remove: first cpu switched %d\n",
482 phdev->first_cpu);
483 }
484 /*
485 * It is possible that no siblings left as this was the last cpu
486 * going offline. We don't need to worry about this assignment
487 * as the phydev entry will be removed in this case and
488 * thermal zone is removed.
489 */
490 --phdev->ref_cnt;
491 pr_debug("thermal_device_remove: pkg: %d cpu %d ref_cnt %d\n",
492 phys_proc_id, cpu, phdev->ref_cnt);
493 if (!phdev->ref_cnt)
494 list_for_each_entry_safe(phdev, n, &phy_dev_list, list) {
495 if (phdev->phys_proc_id == phys_proc_id) {
496 thermal_zone_device_unregister(phdev->tzone);
497 list_del(&phdev->list);
498 kfree(phdev);
499 break;
500 }
501 }
502 mutex_unlock(&phy_dev_list_mutex);
503
504 return 0;
505}
506
507static int get_core_online(unsigned int cpu)
508{
509 struct cpuinfo_x86 *c = &cpu_data(cpu);
510 struct phy_dev_entry *phdev = pkg_temp_thermal_get_phy_entry(cpu);
511
512 /* Check if there is already an instance for this package */
513 if (!phdev) {
514 if (!cpu_has(c, X86_FEATURE_DTHERM) &&
515 !cpu_has(c, X86_FEATURE_PTS))
516 return -ENODEV;
517 if (pkg_temp_thermal_device_add(cpu))
518 return -ENODEV;
519 } else {
520 mutex_lock(&phy_dev_list_mutex);
521 ++phdev->ref_cnt;
522 pr_debug("get_core_online: cpu %d ref_cnt %d\n",
523 cpu, phdev->ref_cnt);
524 mutex_unlock(&phy_dev_list_mutex);
525 }
526 INIT_DELAYED_WORK(&per_cpu(pkg_temp_thermal_threshold_work, cpu),
527 pkg_temp_thermal_threshold_work_fn);
528
529 pr_debug("get_core_online: cpu %d successful\n", cpu);
530
531 return 0;
532}
533
534static void put_core_offline(unsigned int cpu)
535{
536 if (!pkg_temp_thermal_device_remove(cpu))
537 cancel_delayed_work_sync(
538 &per_cpu(pkg_temp_thermal_threshold_work, cpu));
539
540 pr_debug("put_core_offline: cpu %d\n", cpu);
541}
542
543static int pkg_temp_thermal_cpu_callback(struct notifier_block *nfb,
544 unsigned long action, void *hcpu)
545{
546 unsigned int cpu = (unsigned long) hcpu;
547
548 switch (action) {
549 case CPU_ONLINE:
550 case CPU_DOWN_FAILED:
551 get_core_online(cpu);
552 break;
553 case CPU_DOWN_PREPARE:
554 put_core_offline(cpu);
555 break;
556 }
557 return NOTIFY_OK;
558}
559
560static struct notifier_block pkg_temp_thermal_notifier __refdata = {
561 .notifier_call = pkg_temp_thermal_cpu_callback,
562};
563
564static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
565 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_DTHERM },
566 {}
567};
568MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
569
570static int __init pkg_temp_thermal_init(void)
571{
572 int i;
573
574 if (!x86_match_cpu(pkg_temp_thermal_ids))
575 return -ENODEV;
576
577 spin_lock_init(&pkg_work_lock);
578 platform_thermal_package_notify =
579 pkg_temp_thermal_platform_thermal_notify;
580 platform_thermal_package_rate_control =
581 pkg_temp_thermal_platform_thermal_rate_control;
582
583 get_online_cpus();
584 for_each_online_cpu(i)
585 if (get_core_online(i))
586 goto err_ret;
587 register_hotcpu_notifier(&pkg_temp_thermal_notifier);
588 put_online_cpus();
589
590 pkg_temp_debugfs_init(); /* Don't care if fails */
591
592 return 0;
593
594err_ret:
595 get_online_cpus();
596 for_each_online_cpu(i)
597 put_core_offline(i);
598 put_online_cpus();
599 kfree(pkg_work_scheduled);
600 platform_thermal_package_notify = NULL;
601 platform_thermal_package_rate_control = NULL;
602
603 return -ENODEV;
604}
605
606static void __exit pkg_temp_thermal_exit(void)
607{
608 struct phy_dev_entry *phdev, *n;
609 int i;
610
611 get_online_cpus();
612 unregister_hotcpu_notifier(&pkg_temp_thermal_notifier);
613 mutex_lock(&phy_dev_list_mutex);
614 list_for_each_entry_safe(phdev, n, &phy_dev_list, list) {
615 /* Retore old MSR value for package thermal interrupt */
616 wrmsr_on_cpu(phdev->first_cpu,
617 MSR_IA32_PACKAGE_THERM_INTERRUPT,
618 phdev->start_pkg_therm_low,
619 phdev->start_pkg_therm_high);
620 thermal_zone_device_unregister(phdev->tzone);
621 list_del(&phdev->list);
622 kfree(phdev);
623 }
624 mutex_unlock(&phy_dev_list_mutex);
625 platform_thermal_package_notify = NULL;
626 platform_thermal_package_rate_control = NULL;
627 for_each_online_cpu(i)
628 cancel_delayed_work_sync(
629 &per_cpu(pkg_temp_thermal_threshold_work, i));
630 put_online_cpus();
631
632 kfree(pkg_work_scheduled);
633
634 debugfs_remove_recursive(debugfs);
635}
636
637module_init(pkg_temp_thermal_init)
638module_exit(pkg_temp_thermal_exit)
639
640MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
641MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
642MODULE_LICENSE("GPL v2");