aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/powercap/intel_rapl.c
diff options
context:
space:
mode:
author Jacob Pan <jacob.jun.pan@linux.intel.com> 2013-10-17 13:28:35 -0400
committer Rafael J. Wysocki <rafael.j.wysocki@intel.com> 2013-10-18 07:29:52 -0400
commit 2d281d8196e38dd3a4ee9af26621ddde8329f269 (patch)
tree f3cce224bd2909635e8b311ae27d6234b9b0d9cc /drivers/powercap/intel_rapl.c
parent bfd1ff6375c82930bfb3b401eee2c96720fa8e84 (diff)
PowerCap: Introduce Intel RAPL power capping driver
The Intel Running Average Power Limit (RAPL) technology provides platform software with the ability to monitor, control, and get notifications on power usage. This feature is present in all Sandy Bridge and later Intel processors. Newer models allow more fine grained controls to be applied. In RAPL, power control is divided into domains, which include package, DRAM controller, CPU core (Power Plane 0), graphics uncore (power plane 1), etc. The purpose of this driver is to expose the RAPL settings to userspace. Overall, RAPL fits in the new powercap class driver in that platform level power capping controls are exposed via this generic interface. This driver is based on an earlier patch from Zhang Rui. However, while the previous work was mainly focused on thermal monitoring the focus here is on the usability from user space perspective. References: https://lkml.org/lkml/2011/5/26/93 Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com> Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Diffstat (limited to 'drivers/powercap/intel_rapl.c')
-rw-r--r--drivers/powercap/intel_rapl.c1395
1 files changed, 1395 insertions, 0 deletions
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
new file mode 100644
index 000000000000..2a786c504460
--- /dev/null
+++ b/drivers/powercap/intel_rapl.c
@@ -0,0 +1,1395 @@
1/*
2 * Intel Running Average Power Limit (RAPL) Driver
3 * Copyright (c) 2013, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.
16 *
17 */
18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/list.h>
23#include <linux/types.h>
24#include <linux/device.h>
25#include <linux/slab.h>
26#include <linux/log2.h>
27#include <linux/bitmap.h>
28#include <linux/delay.h>
29#include <linux/sysfs.h>
30#include <linux/cpu.h>
31#include <linux/powercap.h>
32
33#include <asm/processor.h>
34#include <asm/cpu_device_id.h>
35
/*
 * Bit-field masks for the RAPL MSRs.  They are consumed through the
 * primitive table (rpi[]) by the generic read/write accessors.
 */

/* energy status MSR */
#define ENERGY_STATUS_MASK		0xffffffff

/* power limit MSR: limit #1 fields */
#define POWER_LIMIT1_MASK		0x7FFF
#define POWER_LIMIT1_ENABLE		BIT(15)
#define POWER_LIMIT1_CLAMP		BIT(16)

/* power limit MSR: limit #2 fields and lock bits */
#define POWER_LIMIT2_MASK		(0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE		BIT_ULL(47)
#define POWER_LIMIT2_CLAMP		BIT_ULL(48)
#define POWER_PACKAGE_LOCK		BIT_ULL(63)
#define POWER_PP_LOCK			BIT(31)

/* power limit MSR: time window fields */
#define TIME_WINDOW1_MASK		(0x7FULL<<17)
#define TIME_WINDOW2_MASK		(0x7FULL<<49)

/* unit MSR: field positions and masks */
#define POWER_UNIT_OFFSET		0
#define POWER_UNIT_MASK			0x0F

#define ENERGY_UNIT_OFFSET		0x08
#define ENERGY_UNIT_MASK		0x1F00

#define TIME_UNIT_OFFSET		0x10
#define TIME_UNIT_MASK			0xF0000

/* power info MSR */
#define POWER_INFO_MAX_MASK		(0x7fffULL<<32)
#define POWER_INFO_MIN_MASK		(0x7fffULL<<16)
#define POWER_INFO_MAX_TIME_WIN_MASK	(0x3fULL<<48)
#define POWER_INFO_THERMAL_SPEC_MASK	0x7fff

/* perf status and policy MSRs */
#define PERF_STATUS_THROTTLE_TIME_MASK	0xffffffff
#define PP_POLICY_MASK			0x1F

/* Software-only flags stored in rapl_primitive_info.flag */
#define RAPL_PRIMITIVE_DERIVED		BIT(1) /* not from raw data */
#define RAPL_PRIMITIVE_DUMMY		BIT(2)

/*
 * Fixed-point scale factors; raw RAPL units are multiplied up so that
 * the kernel never needs floating point math.
 */
#define POWER_UNIT_SCALE		(1000000)
#define ENERGY_UNIT_SCALE		(1000000)
#define TIME_UNIT_SCALE			(1000000)

#define TIME_WINDOW_MAX_MSEC		40000
#define TIME_WINDOW_MIN_MSEC		250
80
/* Selects which per-package divisor rapl_unit_xlate() applies to a value. */
enum unit_type {
	ARBITRARY_UNIT = 0,	/* no translation */
	POWER_UNIT,
	ENERGY_UNIT,
	TIME_UNIT,
};
87
/*
 * RAPL domains.  The values double as bit positions in
 * rapl_package.domain_map and as indices into rapl_domain_names[]
 * and zone_ops[].
 */
enum rapl_domain_type {
	RAPL_DOMAIN_PACKAGE = 0,	/* entire package/socket */
	RAPL_DOMAIN_PP0,		/* core power plane */
	RAPL_DOMAIN_PP1,		/* graphics uncore */
	RAPL_DOMAIN_DRAM,		/* DRAM control_type */
	RAPL_DOMAIN_MAX,
};
95
/* Index of each per-domain MSR inside rapl_domain.msrs[]. */
enum rapl_domain_msr_id {
	RAPL_DOMAIN_MSR_LIMIT = 0,
	RAPL_DOMAIN_MSR_STATUS,
	RAPL_DOMAIN_MSR_PERF,
	RAPL_DOMAIN_MSR_POLICY,
	RAPL_DOMAIN_MSR_INFO,
	RAPL_DOMAIN_MSR_MAX,
};
104
/*
 * Per-domain data items ("primitives"); some are optional.  The order
 * must match the rpi[] table, which is indexed by these values.
 */
enum rapl_primitives {
	ENERGY_COUNTER = 0,
	POWER_LIMIT1,
	POWER_LIMIT2,
	FW_LOCK,

	PL1_ENABLE,	/* power limit 1, aka long term */
	PL1_CLAMP,	/* allow frequency to go below OS request */
	PL2_ENABLE,	/* power limit 2, aka short term, instantaneous */
	PL2_CLAMP,

	TIME_WINDOW1,	/* long term */
	TIME_WINDOW2,	/* short term */
	THERMAL_SPEC_POWER,
	MAX_POWER,

	MIN_POWER,
	MAX_TIME_WINDOW,
	THROTTLED_TIME,
	PRIORITY_LEVEL,

	/* entries below are not raw primitive data */
	AVERAGE_POWER,
	NR_RAPL_PRIMITIVES,
};

/* Upper bound used by rapl_update_domain_data() when caching primitives. */
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
133
134/* Can be expanded to include events, etc.*/
135struct rapl_domain_data {
136 u64 primitives[NR_RAPL_PRIMITIVES];
137 unsigned long timestamp;
138};
139
140
/* Bits kept in rapl_domain.state */
#define DOMAIN_STATE_INACTIVE		BIT(0)
#define DOMAIN_STATE_POWER_LIMIT_SET	BIT(1)
#define DOMAIN_STATE_BIOS_LOCKED	BIT(2)

/* Maximum number of power limits (constraints) a domain can expose. */
#define NR_POWER_LIMITS (2)

/**
 * struct rapl_power_limit - one power limit (constraint) of a domain
 * @constraint: powercap constraint associated with this limit
 * @prim_id: primitive ID used to enable the limit (PL1_ENABLE/PL2_ENABLE)
 * @domain: owning RAPL domain
 * @name: constraint name; NULL marks an unused slot
 */
struct rapl_power_limit {
	struct powercap_zone_constraint *constraint;
	int prim_id;
	struct rapl_domain *domain;
	const char *name;
};

/* Constraint names reported for power limit 1 and 2 respectively. */
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";
155
156struct rapl_domain {
157 const char *name;
158 enum rapl_domain_type id;
159 int msrs[RAPL_DOMAIN_MSR_MAX];
160 struct powercap_zone power_zone;
161 struct rapl_domain_data rdd;
162 struct rapl_power_limit rpl[NR_POWER_LIMITS];
163 u64 attr_map; /* track capabilities */
164 unsigned int state;
165 int package_id;
166};
167#define power_zone_to_rapl_domain(_zone) \
168 container_of(_zone, struct rapl_domain, power_zone)
169
170
171/* Each physical package contains multiple domains, these are the common
172 * data across RAPL domains within a package.
173 */
174struct rapl_package {
175 unsigned int id; /* physical package/socket id */
176 unsigned int nr_domains;
177 unsigned long domain_map; /* bit map of active domains */
178 unsigned int power_unit_divisor;
179 unsigned int energy_unit_divisor;
180 unsigned int time_unit_divisor;
181 struct rapl_domain *domains; /* array of domains, sized at runtime */
182 struct powercap_zone *power_zone; /* keep track of parent zone */
183 int nr_cpus; /* active cpus on the package, topology info is lost during
184 * cpu hotplug. so we have to track ourselves.
185 */
186 unsigned long power_limit_irq; /* keep track of package power limit
187 * notify interrupt enable status.
188 */
189 struct list_head plist;
190};
191#define PACKAGE_PLN_INT_SAVED BIT(0)
192#define MAX_PRIM_NAME (32)
193
194/* per domain data. used to describe individual knobs such that access function
195 * can be consolidated into one instead of many inline functions.
196 */
197struct rapl_primitive_info {
198 const char *name;
199 u64 mask;
200 int shift;
201 enum rapl_domain_msr_id id;
202 enum unit_type unit;
203 u32 flag;
204};
205
206#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
207 .name = #p, \
208 .mask = m, \
209 .shift = s, \
210 .id = i, \
211 .unit = u, \
212 .flag = f \
213 }
214
215static void rapl_init_domains(struct rapl_package *rp);
216static int rapl_read_data_raw(struct rapl_domain *rd,
217 enum rapl_primitives prim,
218 bool xlate, u64 *data);
219static int rapl_write_data_raw(struct rapl_domain *rd,
220 enum rapl_primitives prim,
221 unsigned long long value);
222static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value,
223 int to_raw);
224static void package_power_limit_irq_save(int package_id);
225
226static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */
227
228static const char * const rapl_domain_names[] = {
229 "package",
230 "core",
231 "uncore",
232 "dram",
233};
234
235static struct powercap_control_type *control_type; /* PowerCap Controller */
236
237/* caller to ensure CPU hotplug lock is held */
238static struct rapl_package *find_package_by_id(int id)
239{
240 struct rapl_package *rp;
241
242 list_for_each_entry(rp, &rapl_packages, plist) {
243 if (rp->id == id)
244 return rp;
245 }
246
247 return NULL;
248}
249
250/* caller to ensure CPU hotplug lock is held */
251static int find_active_cpu_on_package(int package_id)
252{
253 int i;
254
255 for_each_online_cpu(i) {
256 if (topology_physical_package_id(i) == package_id)
257 return i;
258 }
259 /* all CPUs on this package are offline */
260
261 return -ENODEV;
262}
263
264/* caller must hold cpu hotplug lock */
265static void rapl_cleanup_data(void)
266{
267 struct rapl_package *p, *tmp;
268
269 list_for_each_entry_safe(p, tmp, &rapl_packages, plist) {
270 kfree(p->domains);
271 list_del(&p->plist);
272 kfree(p);
273 }
274}
275
276static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
277{
278 struct rapl_domain *rd;
279 u64 energy_now;
280
281 /* prevent CPU hotplug, make sure the RAPL domain does not go
282 * away while reading the counter.
283 */
284 get_online_cpus();
285 rd = power_zone_to_rapl_domain(power_zone);
286
287 if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
288 *energy_raw = energy_now;
289 put_online_cpus();
290
291 return 0;
292 }
293 put_online_cpus();
294
295 return -EIO;
296}
297
298static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
299{
300 *energy = rapl_unit_xlate(0, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
301 return 0;
302}
303
304static int release_zone(struct powercap_zone *power_zone)
305{
306 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
307 struct rapl_package *rp;
308
309 /* package zone is the last zone of a package, we can free
310 * memory here since all children has been unregistered.
311 */
312 if (rd->id == RAPL_DOMAIN_PACKAGE) {
313 rp = find_package_by_id(rd->package_id);
314 if (!rp) {
315 dev_warn(&power_zone->dev, "no package id %s\n",
316 rd->name);
317 return -ENODEV;
318 }
319 kfree(rd);
320 rp->domains = NULL;
321 }
322
323 return 0;
324
325}
326
327static int find_nr_power_limit(struct rapl_domain *rd)
328{
329 int i;
330
331 for (i = 0; i < NR_POWER_LIMITS; i++) {
332 if (rd->rpl[i].name == NULL)
333 break;
334 }
335
336 return i;
337}
338
339static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
340{
341 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
342 int nr_powerlimit;
343
344 if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
345 return -EACCES;
346 get_online_cpus();
347 nr_powerlimit = find_nr_power_limit(rd);
348 /* here we activate/deactivate the hardware for power limiting */
349 rapl_write_data_raw(rd, PL1_ENABLE, mode);
350 /* always enable clamp such that p-state can go below OS requested
351 * range. power capping priority over guranteed frequency.
352 */
353 rapl_write_data_raw(rd, PL1_CLAMP, mode);
354 /* some domains have pl2 */
355 if (nr_powerlimit > 1) {
356 rapl_write_data_raw(rd, PL2_ENABLE, mode);
357 rapl_write_data_raw(rd, PL2_CLAMP, mode);
358 }
359 put_online_cpus();
360
361 return 0;
362}
363
364static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
365{
366 struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
367 u64 val;
368
369 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
370 *mode = false;
371 return 0;
372 }
373 get_online_cpus();
374 if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
375 put_online_cpus();
376 return -EIO;
377 }
378 *mode = val;
379 put_online_cpus();
380
381 return 0;
382}
383
384/* per RAPL domain ops, in the order of rapl_domain_type */
385static struct powercap_zone_ops zone_ops[] = {
386 /* RAPL_DOMAIN_PACKAGE */
387 {
388 .get_energy_uj = get_energy_counter,
389 .get_max_energy_range_uj = get_max_energy_counter,
390 .release = release_zone,
391 .set_enable = set_domain_enable,
392 .get_enable = get_domain_enable,
393 },
394 /* RAPL_DOMAIN_PP0 */
395 {
396 .get_energy_uj = get_energy_counter,
397 .get_max_energy_range_uj = get_max_energy_counter,
398 .release = release_zone,
399 .set_enable = set_domain_enable,
400 .get_enable = get_domain_enable,
401 },
402 /* RAPL_DOMAIN_PP1 */
403 {
404 .get_energy_uj = get_energy_counter,
405 .get_max_energy_range_uj = get_max_energy_counter,
406 .release = release_zone,
407 .set_enable = set_domain_enable,
408 .get_enable = get_domain_enable,
409 },
410 /* RAPL_DOMAIN_DRAM */
411 {
412 .get_energy_uj = get_energy_counter,
413 .get_max_energy_range_uj = get_max_energy_counter,
414 .release = release_zone,
415 .set_enable = set_domain_enable,
416 .get_enable = get_domain_enable,
417 },
418};
419
420static int set_power_limit(struct powercap_zone *power_zone, int id,
421 u64 power_limit)
422{
423 struct rapl_domain *rd;
424 struct rapl_package *rp;
425 int ret = 0;
426
427 get_online_cpus();
428 rd = power_zone_to_rapl_domain(power_zone);
429 rp = find_package_by_id(rd->package_id);
430 if (!rp) {
431 ret = -ENODEV;
432 goto set_exit;
433 }
434
435 if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
436 dev_warn(&power_zone->dev, "%s locked by BIOS, monitoring only\n",
437 rd->name);
438 ret = -EACCES;
439 goto set_exit;
440 }
441
442 switch (rd->rpl[id].prim_id) {
443 case PL1_ENABLE:
444 rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
445 break;
446 case PL2_ENABLE:
447 rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
448 break;
449 default:
450 ret = -EINVAL;
451 }
452 if (!ret)
453 package_power_limit_irq_save(rd->package_id);
454set_exit:
455 put_online_cpus();
456 return ret;
457}
458
459static int get_current_power_limit(struct powercap_zone *power_zone, int id,
460 u64 *data)
461{
462 struct rapl_domain *rd;
463 u64 val;
464 int prim;
465 int ret = 0;
466
467 get_online_cpus();
468 rd = power_zone_to_rapl_domain(power_zone);
469 switch (rd->rpl[id].prim_id) {
470 case PL1_ENABLE:
471 prim = POWER_LIMIT1;
472 break;
473 case PL2_ENABLE:
474 prim = POWER_LIMIT2;
475 break;
476 default:
477 put_online_cpus();
478 return -EINVAL;
479 }
480 if (rapl_read_data_raw(rd, prim, true, &val))
481 ret = -EIO;
482 else
483 *data = val;
484
485 put_online_cpus();
486
487 return ret;
488}
489
490static int set_time_window(struct powercap_zone *power_zone, int id,
491 u64 window)
492{
493 struct rapl_domain *rd;
494 int ret = 0;
495
496 get_online_cpus();
497 rd = power_zone_to_rapl_domain(power_zone);
498 switch (rd->rpl[id].prim_id) {
499 case PL1_ENABLE:
500 rapl_write_data_raw(rd, TIME_WINDOW1, window);
501 break;
502 case PL2_ENABLE:
503 rapl_write_data_raw(rd, TIME_WINDOW2, window);
504 break;
505 default:
506 ret = -EINVAL;
507 }
508 put_online_cpus();
509 return ret;
510}
511
512static int get_time_window(struct powercap_zone *power_zone, int id, u64 *data)
513{
514 struct rapl_domain *rd;
515 u64 val;
516 int ret = 0;
517
518 get_online_cpus();
519 rd = power_zone_to_rapl_domain(power_zone);
520 switch (rd->rpl[id].prim_id) {
521 case PL1_ENABLE:
522 ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
523 break;
524 case PL2_ENABLE:
525 ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
526 break;
527 default:
528 put_online_cpus();
529 return -EINVAL;
530 }
531 if (!ret)
532 *data = val;
533 put_online_cpus();
534
535 return ret;
536}
537
538static const char *get_constraint_name(struct powercap_zone *power_zone, int id)
539{
540 struct rapl_power_limit *rpl;
541 struct rapl_domain *rd;
542
543 rd = power_zone_to_rapl_domain(power_zone);
544 rpl = (struct rapl_power_limit *) &rd->rpl[id];
545
546 return rpl->name;
547}
548
549
550static int get_max_power(struct powercap_zone *power_zone, int id,
551 u64 *data)
552{
553 struct rapl_domain *rd;
554 u64 val;
555 int prim;
556 int ret = 0;
557
558 get_online_cpus();
559 rd = power_zone_to_rapl_domain(power_zone);
560 switch (rd->rpl[id].prim_id) {
561 case PL1_ENABLE:
562 prim = THERMAL_SPEC_POWER;
563 break;
564 case PL2_ENABLE:
565 prim = MAX_POWER;
566 break;
567 default:
568 put_online_cpus();
569 return -EINVAL;
570 }
571 if (rapl_read_data_raw(rd, prim, true, &val))
572 ret = -EIO;
573 else
574 *data = val;
575
576 put_online_cpus();
577
578 return ret;
579}
580
581static struct powercap_zone_constraint_ops constraint_ops = {
582 .set_power_limit_uw = set_power_limit,
583 .get_power_limit_uw = get_current_power_limit,
584 .set_time_window_us = set_time_window,
585 .get_time_window_us = get_time_window,
586 .get_max_power_uw = get_max_power,
587 .get_name = get_constraint_name,
588};
589
590/* called after domain detection and package level data are set */
591static void rapl_init_domains(struct rapl_package *rp)
592{
593 int i;
594 struct rapl_domain *rd = rp->domains;
595
596 for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
597 unsigned int mask = rp->domain_map & (1 << i);
598 switch (mask) {
599 case BIT(RAPL_DOMAIN_PACKAGE):
600 rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
601 rd->id = RAPL_DOMAIN_PACKAGE;
602 rd->msrs[0] = MSR_PKG_POWER_LIMIT;
603 rd->msrs[1] = MSR_PKG_ENERGY_STATUS;
604 rd->msrs[2] = MSR_PKG_PERF_STATUS;
605 rd->msrs[3] = 0;
606 rd->msrs[4] = MSR_PKG_POWER_INFO;
607 rd->rpl[0].prim_id = PL1_ENABLE;
608 rd->rpl[0].name = pl1_name;
609 rd->rpl[1].prim_id = PL2_ENABLE;
610 rd->rpl[1].name = pl2_name;
611 break;
612 case BIT(RAPL_DOMAIN_PP0):
613 rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
614 rd->id = RAPL_DOMAIN_PP0;
615 rd->msrs[0] = MSR_PP0_POWER_LIMIT;
616 rd->msrs[1] = MSR_PP0_ENERGY_STATUS;
617 rd->msrs[2] = 0;
618 rd->msrs[3] = MSR_PP0_POLICY;
619 rd->msrs[4] = 0;
620 rd->rpl[0].prim_id = PL1_ENABLE;
621 rd->rpl[0].name = pl1_name;
622 break;
623 case BIT(RAPL_DOMAIN_PP1):
624 rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
625 rd->id = RAPL_DOMAIN_PP1;
626 rd->msrs[0] = MSR_PP1_POWER_LIMIT;
627 rd->msrs[1] = MSR_PP1_ENERGY_STATUS;
628 rd->msrs[2] = 0;
629 rd->msrs[3] = MSR_PP1_POLICY;
630 rd->msrs[4] = 0;
631 rd->rpl[0].prim_id = PL1_ENABLE;
632 rd->rpl[0].name = pl1_name;
633 break;
634 case BIT(RAPL_DOMAIN_DRAM):
635 rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
636 rd->id = RAPL_DOMAIN_DRAM;
637 rd->msrs[0] = MSR_DRAM_POWER_LIMIT;
638 rd->msrs[1] = MSR_DRAM_ENERGY_STATUS;
639 rd->msrs[2] = MSR_DRAM_PERF_STATUS;
640 rd->msrs[3] = 0;
641 rd->msrs[4] = MSR_DRAM_POWER_INFO;
642 rd->rpl[0].prim_id = PL1_ENABLE;
643 rd->rpl[0].name = pl1_name;
644 break;
645 }
646 if (mask) {
647 rd->package_id = rp->id;
648 rd++;
649 }
650 }
651}
652
653static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value,
654 int to_raw)
655{
656 u64 divisor = 1;
657 int scale = 1; /* scale to user friendly data without floating point */
658 u64 f, y; /* fraction and exp. used for time unit */
659 struct rapl_package *rp;
660
661 rp = find_package_by_id(package);
662 if (!rp)
663 return value;
664
665 switch (type) {
666 case POWER_UNIT:
667 divisor = rp->power_unit_divisor;
668 scale = POWER_UNIT_SCALE;
669 break;
670 case ENERGY_UNIT:
671 scale = ENERGY_UNIT_SCALE;
672 divisor = rp->energy_unit_divisor;
673 break;
674 case TIME_UNIT:
675 divisor = rp->time_unit_divisor;
676 scale = TIME_UNIT_SCALE;
677 /* special processing based on 2^Y*(1+F)/4 = val/divisor, refer
678 * to Intel Software Developer's manual Vol. 3a, CH 14.7.4.
679 */
680 if (!to_raw) {
681 f = (value & 0x60) >> 5;
682 y = value & 0x1f;
683 value = (1 << y) * (4 + f) * scale / 4;
684 return div64_u64(value, divisor);
685 } else {
686 do_div(value, scale);
687 value *= divisor;
688 y = ilog2(value);
689 f = div64_u64(4 * (value - (1 << y)), 1 << y);
690 value = (y & 0x1f) | ((f & 0x3) << 5);
691 return value;
692 }
693 break;
694 case ARBITRARY_UNIT:
695 default:
696 return value;
697 };
698
699 if (to_raw)
700 return div64_u64(value * divisor, scale);
701 else
702 return div64_u64(value * scale, divisor);
703}
704
705/* in the order of enum rapl_primitives */
706static struct rapl_primitive_info rpi[] = {
707 /* name, mask, shift, msr index, unit divisor */
708 PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
709 RAPL_DOMAIN_MSR_STATUS, ENERGY_UNIT, 0),
710 PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
711 RAPL_DOMAIN_MSR_LIMIT, POWER_UNIT, 0),
712 PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
713 RAPL_DOMAIN_MSR_LIMIT, POWER_UNIT, 0),
714 PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
715 RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
716 PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
717 RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
718 PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
719 RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
720 PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
721 RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
722 PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
723 RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
724 PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
725 RAPL_DOMAIN_MSR_LIMIT, TIME_UNIT, 0),
726 PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
727 RAPL_DOMAIN_MSR_LIMIT, TIME_UNIT, 0),
728 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
729 0, RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0),
730 PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
731 RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0),
732 PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
733 RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0),
734 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
735 RAPL_DOMAIN_MSR_INFO, TIME_UNIT, 0),
736 PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
737 RAPL_DOMAIN_MSR_PERF, TIME_UNIT, 0),
738 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
739 RAPL_DOMAIN_MSR_POLICY, ARBITRARY_UNIT, 0),
740 /* non-hardware */
741 PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
742 RAPL_PRIMITIVE_DERIVED),
743 {NULL, 0, 0, 0},
744};
745
746/* Read primitive data based on its related struct rapl_primitive_info.
747 * if xlate flag is set, return translated data based on data units, i.e.
748 * time, energy, and power.
749 * RAPL MSRs are non-architectual and are laid out not consistently across
750 * domains. Here we use primitive info to allow writing consolidated access
751 * functions.
752 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
753 * is pre-assigned based on RAPL unit MSRs read at init time.
754 * 63-------------------------- 31--------------------------- 0
755 * | xxxxx (mask) |
756 * | |<- shift ----------------|
757 * 63-------------------------- 31--------------------------- 0
758 */
759static int rapl_read_data_raw(struct rapl_domain *rd,
760 enum rapl_primitives prim,
761 bool xlate, u64 *data)
762{
763 u64 value, final;
764 u32 msr;
765 struct rapl_primitive_info *rp = &rpi[prim];
766 int cpu;
767
768 if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
769 return -EINVAL;
770
771 msr = rd->msrs[rp->id];
772 if (!msr)
773 return -EINVAL;
774 /* use physical package id to look up active cpus */
775 cpu = find_active_cpu_on_package(rd->package_id);
776 if (cpu < 0)
777 return cpu;
778
779 /* special-case package domain, which uses a different bit*/
780 if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
781 rp->mask = POWER_PACKAGE_LOCK;
782 rp->shift = 63;
783 }
784 /* non-hardware data are collected by the polling thread */
785 if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
786 *data = rd->rdd.primitives[prim];
787 return 0;
788 }
789
790 if (rdmsrl_safe_on_cpu(cpu, msr, &value)) {
791 pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
792 return -EIO;
793 }
794
795 final = value & rp->mask;
796 final = final >> rp->shift;
797 if (xlate)
798 *data = rapl_unit_xlate(rd->package_id, rp->unit, final, 0);
799 else
800 *data = final;
801
802 return 0;
803}
804
805/* Similar use of primitive info in the read counterpart */
806static int rapl_write_data_raw(struct rapl_domain *rd,
807 enum rapl_primitives prim,
808 unsigned long long value)
809{
810 u64 msr_val;
811 u32 msr;
812 struct rapl_primitive_info *rp = &rpi[prim];
813 int cpu;
814
815 cpu = find_active_cpu_on_package(rd->package_id);
816 if (cpu < 0)
817 return cpu;
818 msr = rd->msrs[rp->id];
819 if (rdmsrl_safe_on_cpu(cpu, msr, &msr_val)) {
820 dev_dbg(&rd->power_zone.dev,
821 "failed to read msr 0x%x on cpu %d\n", msr, cpu);
822 return -EIO;
823 }
824 value = rapl_unit_xlate(rd->package_id, rp->unit, value, 1);
825 msr_val &= ~rp->mask;
826 msr_val |= value << rp->shift;
827 if (wrmsrl_safe_on_cpu(cpu, msr, msr_val)) {
828 dev_dbg(&rd->power_zone.dev,
829 "failed to write msr 0x%x on cpu %d\n", msr, cpu);
830 return -EIO;
831 }
832
833 return 0;
834}
835
836static int rapl_check_unit(struct rapl_package *rp, int cpu)
837{
838 u64 msr_val;
839 u32 value;
840
841 if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
842 pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
843 MSR_RAPL_POWER_UNIT, cpu);
844 return -ENODEV;
845 }
846
847 /* Raw RAPL data stored in MSRs are in certain scales. We need to
848 * convert them into standard units based on the divisors reported in
849 * the RAPL unit MSRs.
850 * i.e.
851 * energy unit: 1/enery_unit_divisor Joules
852 * power unit: 1/power_unit_divisor Watts
853 * time unit: 1/time_unit_divisor Seconds
854 */
855 value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
856 rp->energy_unit_divisor = 1 << value;
857
858
859 value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
860 rp->power_unit_divisor = 1 << value;
861
862 value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
863 rp->time_unit_divisor = 1 << value;
864
865 pr_debug("Physical package %d units: energy=%d, time=%d, power=%d\n",
866 rp->id,
867 rp->energy_unit_divisor,
868 rp->time_unit_divisor,
869 rp->power_unit_divisor);
870
871 return 0;
872}
873
874/* REVISIT:
875 * When package power limit is set artificially low by RAPL, LVT
876 * thermal interrupt for package power limit should be ignored
877 * since we are not really exceeding the real limit. The intention
878 * is to avoid excessive interrupts while we are trying to save power.
879 * A useful feature might be routing the package_power_limit interrupt
880 * to userspace via eventfd. once we have a usecase, this is simple
881 * to do by adding an atomic notifier.
882 */
883
884static void package_power_limit_irq_save(int package_id)
885{
886 u32 l, h = 0;
887 int cpu;
888 struct rapl_package *rp;
889
890 rp = find_package_by_id(package_id);
891 if (!rp)
892 return;
893
894 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
895 return;
896
897 cpu = find_active_cpu_on_package(package_id);
898 if (cpu < 0)
899 return;
900 /* save the state of PLN irq mask bit before disabling it */
901 rdmsr_safe_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
902 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
903 rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
904 rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
905 }
906 l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
907 wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
908}
909
910/* restore per package power limit interrupt enable state */
911static void package_power_limit_irq_restore(int package_id)
912{
913 u32 l, h;
914 int cpu;
915 struct rapl_package *rp;
916
917 rp = find_package_by_id(package_id);
918 if (!rp)
919 return;
920
921 if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
922 return;
923
924 cpu = find_active_cpu_on_package(package_id);
925 if (cpu < 0)
926 return;
927
928 /* irq enable state not saved, nothing to restore */
929 if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
930 return;
931 rdmsr_safe_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
932
933 if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
934 l |= PACKAGE_THERM_INT_PLN_ENABLE;
935 else
936 l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
937
938 wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
939}
940
941static const struct x86_cpu_id rapl_ids[] = {
942 { X86_VENDOR_INTEL, 6, 0x2a},/* SNB */
943 { X86_VENDOR_INTEL, 6, 0x2d},/* SNB EP */
944 { X86_VENDOR_INTEL, 6, 0x3a},/* IVB */
945 { X86_VENDOR_INTEL, 6, 0x45},/* HSW */
946 /* TODO: Add more CPU IDs after testing */
947 {}
948};
949MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
950
951/* read once for all raw primitive data for all packages, domains */
952static void rapl_update_domain_data(void)
953{
954 int dmn, prim;
955 u64 val;
956 struct rapl_package *rp;
957
958 list_for_each_entry(rp, &rapl_packages, plist) {
959 for (dmn = 0; dmn < rp->nr_domains; dmn++) {
960 pr_debug("update package %d domain %s data\n", rp->id,
961 rp->domains[dmn].name);
962 /* exclude non-raw primitives */
963 for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++)
964 if (!rapl_read_data_raw(&rp->domains[dmn], prim,
965 rpi[prim].unit,
966 &val))
967 rp->domains[dmn].rdd.primitives[prim] =
968 val;
969 }
970 }
971
972}
973
974static int rapl_unregister_powercap(void)
975{
976 struct rapl_package *rp;
977 struct rapl_domain *rd, *rd_package = NULL;
978
979 /* unregister all active rapl packages from the powercap layer,
980 * hotplug lock held
981 */
982 list_for_each_entry(rp, &rapl_packages, plist) {
983 package_power_limit_irq_restore(rp->id);
984
985 for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
986 rd++) {
987 pr_debug("remove package, undo power limit on %d: %s\n",
988 rp->id, rd->name);
989 rapl_write_data_raw(rd, PL1_ENABLE, 0);
990 rapl_write_data_raw(rd, PL2_ENABLE, 0);
991 rapl_write_data_raw(rd, PL1_CLAMP, 0);
992 rapl_write_data_raw(rd, PL2_CLAMP, 0);
993 if (rd->id == RAPL_DOMAIN_PACKAGE) {
994 rd_package = rd;
995 continue;
996 }
997 powercap_unregister_zone(control_type, &rd->power_zone);
998 }
999 /* do the package zone last */
1000 if (rd_package)
1001 powercap_unregister_zone(control_type,
1002 &rd_package->power_zone);
1003 }
1004 powercap_unregister_control_type(control_type);
1005
1006 return 0;
1007}
1008
1009static int rapl_package_register_powercap(struct rapl_package *rp)
1010{
1011 struct rapl_domain *rd;
1012 int ret = 0;
1013 char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/
1014 struct powercap_zone *power_zone = NULL;
1015 int nr_pl;
1016
1017 /* first we register package domain as the parent zone*/
1018 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1019 if (rd->id == RAPL_DOMAIN_PACKAGE) {
1020 nr_pl = find_nr_power_limit(rd);
1021 pr_debug("register socket %d package domain %s\n",
1022 rp->id, rd->name);
1023 memset(dev_name, 0, sizeof(dev_name));
1024 snprintf(dev_name, sizeof(dev_name), "%s-%d",
1025 rd->name, rp->id);
1026 power_zone = powercap_register_zone(&rd->power_zone,
1027 control_type,
1028 dev_name, NULL,
1029 &zone_ops[rd->id],
1030 nr_pl,
1031 &constraint_ops);
1032 if (IS_ERR(power_zone)) {
1033 pr_debug("failed to register package, %d\n",
1034 rp->id);
1035 ret = PTR_ERR(power_zone);
1036 goto exit_package;
1037 }
1038 /* track parent zone in per package/socket data */
1039 rp->power_zone = power_zone;
1040 /* done, only one package domain per socket */
1041 break;
1042 }
1043 }
1044 if (!power_zone) {
1045 pr_err("no package domain found, unknown topology!\n");
1046 ret = -ENODEV;
1047 goto exit_package;
1048 }
1049 /* now register domains as children of the socket/package*/
1050 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1051 if (rd->id == RAPL_DOMAIN_PACKAGE)
1052 continue;
1053 /* number of power limits per domain varies */
1054 nr_pl = find_nr_power_limit(rd);
1055 power_zone = powercap_register_zone(&rd->power_zone,
1056 control_type, rd->name,
1057 rp->power_zone,
1058 &zone_ops[rd->id], nr_pl,
1059 &constraint_ops);
1060
1061 if (IS_ERR(power_zone)) {
1062 pr_debug("failed to register power_zone, %d:%s:%s\n",
1063 rp->id, rd->name, dev_name);
1064 ret = PTR_ERR(power_zone);
1065 goto err_cleanup;
1066 }
1067 }
1068
1069exit_package:
1070 return ret;
1071err_cleanup:
1072 /* clean up previously initialized domains within the package if we
1073 * failed after the first domain setup.
1074 */
1075 while (--rd >= rp->domains) {
1076 pr_debug("unregister package %d domain %s\n", rp->id, rd->name);
1077 powercap_unregister_zone(control_type, &rd->power_zone);
1078 }
1079
1080 return ret;
1081}
1082
1083static int rapl_register_powercap(void)
1084{
1085 struct rapl_domain *rd;
1086 struct rapl_package *rp;
1087 int ret = 0;
1088
1089 control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
1090 if (IS_ERR(control_type)) {
1091 pr_debug("failed to register powercap control_type.\n");
1092 return PTR_ERR(control_type);
1093 }
1094 /* read the initial data */
1095 rapl_update_domain_data();
1096 list_for_each_entry(rp, &rapl_packages, plist)
1097 if (rapl_package_register_powercap(rp))
1098 goto err_cleanup_package;
1099 return ret;
1100
1101err_cleanup_package:
1102 /* clean up previously initialized packages */
1103 list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) {
1104 for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
1105 rd++) {
1106 pr_debug("unregister zone/package %d, %s domain\n",
1107 rp->id, rd->name);
1108 powercap_unregister_zone(control_type, &rd->power_zone);
1109 }
1110 }
1111
1112 return ret;
1113}
1114
1115static int rapl_check_domain(int cpu, int domain)
1116{
1117 unsigned msr;
1118 u64 val1, val2 = 0;
1119 int retry = 0;
1120
1121 switch (domain) {
1122 case RAPL_DOMAIN_PACKAGE:
1123 msr = MSR_PKG_ENERGY_STATUS;
1124 break;
1125 case RAPL_DOMAIN_PP0:
1126 msr = MSR_PP0_ENERGY_STATUS;
1127 break;
1128 case RAPL_DOMAIN_PP1:
1129 msr = MSR_PP1_ENERGY_STATUS;
1130 break;
1131 case RAPL_DOMAIN_DRAM:
1132 msr = MSR_DRAM_ENERGY_STATUS;
1133 break;
1134 default:
1135 pr_err("invalid domain id %d\n", domain);
1136 return -EINVAL;
1137 }
1138 if (rdmsrl_safe_on_cpu(cpu, msr, &val1))
1139 return -ENODEV;
1140
1141 /* energy counters roll slowly on some domains */
1142 while (++retry < 10) {
1143 usleep_range(10000, 15000);
1144 rdmsrl_safe_on_cpu(cpu, msr, &val2);
1145 if ((val1 & ENERGY_STATUS_MASK) != (val2 & ENERGY_STATUS_MASK))
1146 return 0;
1147 }
1148 /* if energy counter does not change, report as bad domain */
1149 pr_info("domain %s energy ctr %llu:%llu not working, skip\n",
1150 rapl_domain_names[domain], val1, val2);
1151
1152 return -ENODEV;
1153}
1154
1155/* Detect active and valid domains for the given CPU, caller must
1156 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1157 */
1158static int rapl_detect_domains(struct rapl_package *rp, int cpu)
1159{
1160 int i;
1161 int ret = 0;
1162 struct rapl_domain *rd;
1163 u64 locked;
1164
1165 for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
1166 /* use physical package id to read counters */
1167 if (!rapl_check_domain(cpu, i))
1168 rp->domain_map |= 1 << i;
1169 }
1170 rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
1171 if (!rp->nr_domains) {
1172 pr_err("no valid rapl domains found in package %d\n", rp->id);
1173 ret = -ENODEV;
1174 goto done;
1175 }
1176 pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id);
1177
1178 rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
1179 GFP_KERNEL);
1180 if (!rp->domains) {
1181 ret = -ENOMEM;
1182 goto done;
1183 }
1184 rapl_init_domains(rp);
1185
1186 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1187 /* check if the domain is locked by BIOS */
1188 if (rapl_read_data_raw(rd, FW_LOCK, false, &locked)) {
1189 pr_info("RAPL package %d domain %s locked by BIOS\n",
1190 rp->id, rd->name);
1191 rd->state |= DOMAIN_STATE_BIOS_LOCKED;
1192 }
1193 }
1194
1195
1196done:
1197 return ret;
1198}
1199
1200static bool is_package_new(int package)
1201{
1202 struct rapl_package *rp;
1203
1204 /* caller prevents cpu hotplug, there will be no new packages added
1205 * or deleted while traversing the package list, no need for locking.
1206 */
1207 list_for_each_entry(rp, &rapl_packages, plist)
1208 if (package == rp->id)
1209 return false;
1210
1211 return true;
1212}
1213
1214/* RAPL interface can be made of a two-level hierarchy: package level and domain
1215 * level. We first detect the number of packages then domains of each package.
1216 * We have to consider the possiblity of CPU online/offline due to hotplug and
1217 * other scenarios.
1218 */
1219static int rapl_detect_topology(void)
1220{
1221 int i;
1222 int phy_package_id;
1223 struct rapl_package *new_package, *rp;
1224
1225 for_each_online_cpu(i) {
1226 phy_package_id = topology_physical_package_id(i);
1227 if (is_package_new(phy_package_id)) {
1228 new_package = kzalloc(sizeof(*rp), GFP_KERNEL);
1229 if (!new_package) {
1230 rapl_cleanup_data();
1231 return -ENOMEM;
1232 }
1233 /* add the new package to the list */
1234 new_package->id = phy_package_id;
1235 new_package->nr_cpus = 1;
1236
1237 /* check if the package contains valid domains */
1238 if (rapl_detect_domains(new_package, i) ||
1239 rapl_check_unit(new_package, i)) {
1240 kfree(new_package->domains);
1241 kfree(new_package);
1242 /* free up the packages already initialized */
1243 rapl_cleanup_data();
1244 return -ENODEV;
1245 }
1246 INIT_LIST_HEAD(&new_package->plist);
1247 list_add(&new_package->plist, &rapl_packages);
1248 } else {
1249 rp = find_package_by_id(phy_package_id);
1250 if (rp)
1251 ++rp->nr_cpus;
1252 }
1253 }
1254
1255 return 0;
1256}
1257
1258/* called from CPU hotplug notifier, hotplug lock held */
1259static void rapl_remove_package(struct rapl_package *rp)
1260{
1261 struct rapl_domain *rd, *rd_package = NULL;
1262
1263 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
1264 if (rd->id == RAPL_DOMAIN_PACKAGE) {
1265 rd_package = rd;
1266 continue;
1267 }
1268 pr_debug("remove package %d, %s domain\n", rp->id, rd->name);
1269 powercap_unregister_zone(control_type, &rd->power_zone);
1270 }
1271 /* do parent zone last */
1272 powercap_unregister_zone(control_type, &rd_package->power_zone);
1273 list_del(&rp->plist);
1274 kfree(rp);
1275}
1276
1277/* called from CPU hotplug notifier, hotplug lock held */
1278static int rapl_add_package(int cpu)
1279{
1280 int ret = 0;
1281 int phy_package_id;
1282 struct rapl_package *rp;
1283
1284 phy_package_id = topology_physical_package_id(cpu);
1285 rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
1286 if (!rp)
1287 return -ENOMEM;
1288
1289 /* add the new package to the list */
1290 rp->id = phy_package_id;
1291 rp->nr_cpus = 1;
1292 /* check if the package contains valid domains */
1293 if (rapl_detect_domains(rp, cpu) ||
1294 rapl_check_unit(rp, cpu)) {
1295 ret = -ENODEV;
1296 goto err_free_package;
1297 }
1298 if (!rapl_package_register_powercap(rp)) {
1299 INIT_LIST_HEAD(&rp->plist);
1300 list_add(&rp->plist, &rapl_packages);
1301 return ret;
1302 }
1303
1304err_free_package:
1305 kfree(rp->domains);
1306 kfree(rp);
1307
1308 return ret;
1309}
1310
1311/* Handles CPU hotplug on multi-socket systems.
1312 * If a CPU goes online as the first CPU of the physical package
1313 * we add the RAPL package to the system. Similarly, when the last
1314 * CPU of the package is removed, we remove the RAPL package and its
1315 * associated domains. Cooling devices are handled accordingly at
1316 * per-domain level.
1317 */
1318static int rapl_cpu_callback(struct notifier_block *nfb,
1319 unsigned long action, void *hcpu)
1320{
1321 unsigned long cpu = (unsigned long)hcpu;
1322 int phy_package_id;
1323 struct rapl_package *rp;
1324
1325 phy_package_id = topology_physical_package_id(cpu);
1326 switch (action) {
1327 case CPU_ONLINE:
1328 case CPU_ONLINE_FROZEN:
1329 case CPU_DOWN_FAILED:
1330 case CPU_DOWN_FAILED_FROZEN:
1331 rp = find_package_by_id(phy_package_id);
1332 if (rp)
1333 ++rp->nr_cpus;
1334 else
1335 rapl_add_package(cpu);
1336 break;
1337 case CPU_DOWN_PREPARE:
1338 case CPU_DOWN_PREPARE_FROZEN:
1339 rp = find_package_by_id(phy_package_id);
1340 if (!rp)
1341 break;
1342 if (--rp->nr_cpus == 0)
1343 rapl_remove_package(rp);
1344 }
1345
1346 return NOTIFY_OK;
1347}
1348
1349static struct notifier_block rapl_cpu_notifier = {
1350 .notifier_call = rapl_cpu_callback,
1351};
1352
1353static int __init rapl_init(void)
1354{
1355 int ret = 0;
1356
1357 if (!x86_match_cpu(rapl_ids)) {
1358 pr_err("driver does not support CPU family %d model %d\n",
1359 boot_cpu_data.x86, boot_cpu_data.x86_model);
1360
1361 return -ENODEV;
1362 }
1363 /* prevent CPU hotplug during detection */
1364 get_online_cpus();
1365 ret = rapl_detect_topology();
1366 if (ret)
1367 goto done;
1368
1369 if (rapl_register_powercap()) {
1370 rapl_cleanup_data();
1371 ret = -ENODEV;
1372 goto done;
1373 }
1374 register_hotcpu_notifier(&rapl_cpu_notifier);
1375done:
1376 put_online_cpus();
1377
1378 return ret;
1379}
1380
1381static void __exit rapl_exit(void)
1382{
1383 get_online_cpus();
1384 unregister_hotcpu_notifier(&rapl_cpu_notifier);
1385 rapl_unregister_powercap();
1386 rapl_cleanup_data();
1387 put_online_cpus();
1388}
1389
1390module_init(rapl_init);
1391module_exit(rapl_exit);
1392
1393MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit)");
1394MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1395MODULE_LICENSE("GPL v2");