author:    Stephane Eranian <eranian@google.com>  2013-11-12 11:58:50 -0500
committer: Ingo Molnar <mingo@kernel.org>  2013-11-27 05:16:40 -0500
commit:    4788e5b4b2338f85fa42a712a182d8afd65d7c58
tree:      0cd71b4a6154af51e71104371f0d733501fd2c7b
parent:    410136f5dd96b6013fe6d1011b523b1c247e1ccb

perf/x86: Add Intel RAPL PMU support
This patch adds a new uncore PMU to expose the Intel
RAPL energy consumption counters. Up to 3 counters,
each counting a particular RAPL event, are exposed.
The RAPL counters are available on Intel SandyBridge,
IvyBridge and Haswell. Server SKUs add a 3rd counter.
The following events are available and exposed in sysfs:
- power/energy-cores: power consumption of all cores on socket
- power/energy-pkg: power consumption of all cores + LLC cache
- power/energy-dram: power consumption of DRAM (servers only)
For each event, both the unit (Joules) and the scale (2^-32 J)
are exposed in sysfs for use by perf stat and other tools.
The files are:
/sys/devices/power/events/energy-*.unit
/sys/devices/power/events/energy-*.scale
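As a rough illustration (not part of this patch), a tool could read
the scale file and apply it to a raw count; the hard-coded path and
the made-up raw count below are assumptions:

#include <stdio.h>

/*
 * Hypothetical helper: read a sysfs scale file, e.g.
 * /sys/devices/power/events/energy-pkg.scale, which holds
 * "2.3283064365386962890625e-10" (i.e. 2^-32).
 */
static double read_scale(const char *path)
{
	FILE *f = fopen(path, "r");
	double scale = 0.0;

	if (f) {
		if (fscanf(f, "%lf", &scale) != 1)
			scale = 0.0;
		fclose(f);
	}
	return scale;
}

int main(void)
{
	double scale = read_scale("/sys/devices/power/events/energy-pkg.scale");
	unsigned long long raw = 4294967296ULL;		/* made-up raw count: 2^32 */

	printf("%.6f Joules\n", (double)raw * scale);	/* prints 1.000000 Joules */
	return 0;
}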
The RAPL PMU is uncore by nature and is implemented such
that it only works in system-wide mode. Measuring only
one CPU per socket is sufficient. The /sys/devices/power/cpumask
file can be used by tools to figure out which CPUs to monitor
by default. For instance, on a 2-socket system, 2 CPUs
(one on each socket) will be shown.
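For illustration only, a sketch of parsing that cpumask file: the
cpulist format ("0,4", "0-3", ...) is what the kernel's
cpulist_scnprintf() emits, while the parsing code itself is an
assumption, not part of the patch:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/devices/power/cpumask", "r");
	char *p, *end;

	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);

	/* cpulist format: comma-separated entries, each "N" or "N-M" */
	for (p = buf; *p; p = (*end == ',') ? end + 1 : end) {
		long first = strtol(p, &end, 10);
		long last = first;

		if (p == end)
			break;		/* no more numbers (e.g. trailing newline) */
		if (*end == '-')
			last = strtol(end + 1, &end, 10);
		while (first <= last)
			printf("monitor CPU %ld\n", first++);
	}
	return 0;
}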
All the counters measure in the same unit (exposed via sysfs).
The perf_events API exposes all RAPL counters as 64-bit integers
counting in units of 1/2^32 Joules (about 0.23 nJ). User-level tools
must convert the counts by multiplying them by 2^-32 to obtain
Joules. The reason for this is that the kernel avoids
floating-point math whenever possible because it is expensive
(user floating-point state must be saved). The method used here
avoids kernel floating-point usage and loses no precision.
Thanks to PeterZ for suggesting this approach.
To convert a raw count C to Watts:
   W = C * 2.3 / (1e10 * time)    (2.3e-10 ~= 2^-32)
or, equivalently, compute Joules as ldexp(C, -32) and divide by the
time delta in seconds.
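A minimal user-space sketch of that conversion (not part of this
patch; link with -lm, and the raw count and interval below are
made-up values):

#include <math.h>
#include <stdio.h>

/* Convert a raw RAPL count (in units of 2^-32 J) to Joules. */
static double rapl_count_to_joules(unsigned long long count)
{
	return ldexp((double)count, -32);	/* count * 2^-32, no precision loss */
}

int main(void)
{
	unsigned long long count = 12884901888ULL;	/* made-up raw count: 3 * 2^32 */
	double seconds = 2.0;				/* made-up measurement interval */
	double joules = rapl_count_to_joules(count);

	printf("%.3f J over %.1f s = %.3f W\n", joules, seconds, joules / seconds);
	return 0;	/* prints: 3.000 J over 2.0 s = 1.500 W */
}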
RAPL PMU is a new standalone PMU which registers with the
perf_event core subsystem. The PMU type (attr->type) is
dynamically allocated and is available from /sys/devices/power/type.
Sampling is not supported by the RAPL PMU, and there is no
privilege level filtering either.
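To make the user-visible contract concrete, here is a hedged
user-space sketch that reads the dynamically allocated type from
sysfs and opens the energy-pkg event with perf_event_open(2); the
choice of CPU 0 (ideally taken from /sys/devices/power/cpumask) and
the bare-bones error handling are assumptions:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	FILE *f = fopen("/sys/devices/power/type", "r");
	unsigned long long count;
	int type, fd;

	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = type;		/* dynamically allocated PMU type */
	attr.config = 0x2;		/* energy-pkg, per the sysfs event string */
	attr.size = sizeof(attr);	/* counting mode only: no sample_period */

	/* pid = -1, cpu = 0: system-wide on one CPU; may require root */
	fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("raw count: %llu (x 2^-32 J)\n", count);
	return 0;
}

perf stat reaches the same event symbolically, e.g. something like
'perf stat -a -e power/energy-pkg/ sleep 1', picking up the unit and
scale from the sysfs files above.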
Signed-off-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Link: http://lkml.kernel.org/r/1384275531-10892-4-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 arch/x86/kernel/cpu/Makefile                |   2 +-
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 591 +++++++++++++++++++++++++++
 2 files changed, 592 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..6359506a19ee 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
 endif

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 000000000000..cfcd386b5d89
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,591 @@
/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the 3 energy consumption free running
 * counters (pp0, pkg, dram).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */

/* Clients have PP0, PKG */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_EVENT_DESC(_name, _config)				\
{								\
	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
	.config	= _config,					\
}

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */

struct rapl_pmu {
	spinlock_t	 lock;
	int		 hw_unit;  /* 1/2^hw_unit Joule */
	int		 n_active; /* number of active events */
	struct list_head active_list;
	struct pmu	 *pmu;	   /* pointer to rapl_pmu_class */
};

static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;
	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v)
{
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/(2^32) to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
}

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}
	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);

	buf[n++] = '\n';
	buf[n] = '\0';
	return n;
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");

EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 */
EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");

static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};

static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}
	/*
	 * clear cpu from cpumask
	 * if was set in cpumask and still some cpu on package,
	 * then move to new cpu
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
}

static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}

static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;

	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	/*
	 * grab power unit as: 1/2^unit Joules
	 *
	 * we cache in local PMU instance
	 */
	rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
	pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
	pmu->pmu = &rapl_pmu_class;

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;
	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}

static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;

	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static const struct x86_cpu_id rapl_cpu_match[] = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};

static int __init rapl_pmu_init(void)
{
	struct rapl_pmu *pmu;
	int cpu, ret;

	/*
	 * check for Intel processor family 6
	 */
	if (!x86_match_cpu(rapl_cpu_match))
		return 0;

	/* check supported CPU */
	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
	case 60: /* Haswell */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;

	default:
		/* unsupported */
		return 0;
	}
	get_online_cpus();

	for_each_online_cpu(cpu) {
		rapl_cpu_prepare(cpu);
		rapl_cpu_init(cpu);
	}

	perf_cpu_notifier(rapl_cpu_notifier);

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
		put_online_cpus();
		return -1;
	}

	pmu = __get_cpu_var(rapl_pmu);

	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
		" API unit is 2^-32 Joules,"
		" %d fixed counters\n",
		pmu->hw_unit,
		hweight32(rapl_cntr_mask));

	put_online_cpus();

	return 0;
}
device_initcall(rapl_pmu_init);