path: root/arch/powerpc/kernel/perf_event.c
author     Ingo Molnar <mingo@elte.hu>    2009-09-21 06:02:48 -0400
committer  Ingo Molnar <mingo@elte.hu>    2009-09-21 08:28:04 -0400
commit     cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 (patch)
tree       81f98a3ab46c589792057fe2392c1e10f8ad7893 /arch/powerpc/kernel/perf_event.c
parent     dfc65094d0313cc48969fa60bcf33d693aeb05a7 (diff)
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!

In the past few months the perfcounters subsystem has outgrown its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring and analysis facility.

Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support, the 'counter' name is less and less appropriate.

All in all, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names (in an ABI-compatible fashion).

The word 'event' is also a bit shorter than 'counter', which makes it slightly more convenient to write and handle as well.

Thanks go to Stephane Eranian, who first observed this misnomer and suggested a rename.

User-space tooling and ABI compatibility are not affected - this patch should be function-invariant. (Also, defconfigs were not touched, to keep the size down.)

This patch has been generated via the following script:

  FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')

  sed -i \
    -e 's/PERF_EVENT_/PERF_RECORD_/g' \
    -e 's/PERF_COUNTER/PERF_EVENT/g' \
    -e 's/perf_counter/perf_event/g' \
    -e 's/nb_counters/nb_events/g' \
    -e 's/swcounter/swevent/g' \
    -e 's/tpcounter_event/tp_event/g' \
    $FILES

  for N in $(find . -name perf_counter.[ch]); do
    M=$(echo $N | sed 's/perf_counter/perf_event/g')
    mv $N $M
  done

  FILES=$(find . -name perf_event.*)

  sed -i \
    -e 's/COUNTER_MASK/REG_MASK/g' \
    -e 's/COUNTER/EVENT/g' \
    -e 's/\<event\>/event_id/g' \
    -e 's/counter/event/g' \
    -e 's/Counter/Event/g' \
    $FILES

... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point where the number of pending patches is smallest: the end of the merge window.

Namespace clashes were fixed up in a preparatory patch, and some stylistic fallout will be fixed up in a subsequent patch.

( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event', we can undo that on an individual basis instead of touching an otherwise nicely automated patch. )

Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
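As a concrete illustration of what the tree-wide sed stage does, consider the made-up fragment below (example_tick and its body are hypothetical, not taken from this patch); the API names change, while plain 'counter' identifiers are only rewritten by the second stage, which runs solely on the perf_event.* core files:

  /* Before (hypothetical fragment): */
  static void example_tick(struct perf_counter *counter)
  {
  	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
  		perf_counter_update_userpage(counter);
  }

  /* After the tree-wide sed stage: API names are renamed, while the
   * local identifier 'counter' is untouched here. */
  static void example_tick(struct perf_event *counter)
  {
  	if (counter->state == PERF_EVENT_STATE_ACTIVE)
  		perf_event_update_userpage(counter);
  }

Note the ordering of the expressions: 's/PERF_EVENT_/PERF_RECORD_/g' runs before 's/PERF_COUNTER/PERF_EVENT/g', so the pre-existing PERF_EVENT_* record types become PERF_RECORD_* without colliding with the newly created PERF_EVENT_* names.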
Diffstat (limited to 'arch/powerpc/kernel/perf_event.c')
-rw-r--r--    arch/powerpc/kernel/perf_event.c    1315
1 file changed, 1315 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
new file mode 100644
index 00000000000..c98321fcb45
--- /dev/null
+++ b/arch/powerpc/kernel/perf_event.c
@@ -0,0 +1,1315 @@
1/*
2 * Performance event support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_event.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20#include <asm/ptrace.h>
21
22struct cpu_hw_events {
23 int n_events;
24 int n_percpu;
25 int disabled;
26 int n_added;
27 int n_limited;
28 u8 pmcs_enabled;
29 struct perf_event *event[MAX_HWEVENTS];
30 u64 events[MAX_HWEVENTS];
31 unsigned int flags[MAX_HWEVENTS];
32 unsigned long mmcr[3];
33 struct perf_event *limited_event[MAX_LIMITED_HWEVENTS];
34 u8 limited_hwidx[MAX_LIMITED_HWEVENTS];
35 u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
36 unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
37 unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
38};
39DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
40
41struct power_pmu *ppmu;
42
43/*
44 * Normally, to ignore kernel events we set the FCS (freeze events
45 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
46 * hypervisor bit set in the MSR, or if we are running on a processor
47 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
48 * then we need to use the FCHV bit to ignore kernel events.
49 */
50static unsigned int freeze_events_kernel = MMCR0_FCS;
51
52/*
53 * 32-bit doesn't have MMCRA but does have an MMCR2,
54 * and a few other names are different.
55 */
56#ifdef CONFIG_PPC32
57
58#define MMCR0_FCHV 0
59#define MMCR0_PMCjCE MMCR0_PMCnCE
60
61#define SPRN_MMCRA SPRN_MMCR2
62#define MMCRA_SAMPLE_ENABLE 0
63
64static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
65{
66 return 0;
67}
68static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
69static inline u32 perf_get_misc_flags(struct pt_regs *regs)
70{
71 return 0;
72}
73static inline void perf_read_regs(struct pt_regs *regs) { }
74static inline int perf_intr_is_nmi(struct pt_regs *regs)
75{
76 return 0;
77}
78
79#endif /* CONFIG_PPC32 */
80
81/*
82 * Things that are specific to 64-bit implementations.
83 */
84#ifdef CONFIG_PPC64
85
86static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
87{
88 unsigned long mmcra = regs->dsisr;
89
90 if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
91 unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
92 if (slot > 1)
93 return 4 * (slot - 1);
94 }
95 return 0;
96}
97
98/*
99 * The user wants a data address recorded.
100 * If we're not doing instruction sampling, give them the SDAR
101 * (sampled data address). If we are doing instruction sampling, then
102 * only give them the SDAR if it corresponds to the instruction
103 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
104 * bit in MMCRA.
105 */
106static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
107{
108 unsigned long mmcra = regs->dsisr;
109 unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
110 POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
111
112 if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
113 *addrp = mfspr(SPRN_SDAR);
114}
115
116static inline u32 perf_get_misc_flags(struct pt_regs *regs)
117{
118 unsigned long mmcra = regs->dsisr;
119
120 if (TRAP(regs) != 0xf00)
121 return 0; /* not a PMU interrupt */
122
123 if (ppmu->flags & PPMU_ALT_SIPR) {
124 if (mmcra & POWER6_MMCRA_SIHV)
125 return PERF_RECORD_MISC_HYPERVISOR;
126 return (mmcra & POWER6_MMCRA_SIPR) ?
127 PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL;
128 }
129 if (mmcra & MMCRA_SIHV)
130 return PERF_RECORD_MISC_HYPERVISOR;
131 return (mmcra & MMCRA_SIPR) ? PERF_RECORD_MISC_USER :
132 PERF_RECORD_MISC_KERNEL;
133}
134
135/*
136 * Overload regs->dsisr to store MMCRA so we only need to read it once
137 * on each interrupt.
138 */
139static inline void perf_read_regs(struct pt_regs *regs)
140{
141 regs->dsisr = mfspr(SPRN_MMCRA);
142}
143
144/*
145 * If interrupts were soft-disabled when a PMU interrupt occurs, treat
146 * it as an NMI.
147 */
148static inline int perf_intr_is_nmi(struct pt_regs *regs)
149{
150 return !regs->softe;
151}
152
153#endif /* CONFIG_PPC64 */
154
155static void perf_event_interrupt(struct pt_regs *regs);
156
157void perf_event_print_debug(void)
158{
159}
160
161/*
162 * Read one performance monitor event (PMC).
163 */
164static unsigned long read_pmc(int idx)
165{
166 unsigned long val;
167
168 switch (idx) {
169 case 1:
170 val = mfspr(SPRN_PMC1);
171 break;
172 case 2:
173 val = mfspr(SPRN_PMC2);
174 break;
175 case 3:
176 val = mfspr(SPRN_PMC3);
177 break;
178 case 4:
179 val = mfspr(SPRN_PMC4);
180 break;
181 case 5:
182 val = mfspr(SPRN_PMC5);
183 break;
184 case 6:
185 val = mfspr(SPRN_PMC6);
186 break;
187#ifdef CONFIG_PPC64
188 case 7:
189 val = mfspr(SPRN_PMC7);
190 break;
191 case 8:
192 val = mfspr(SPRN_PMC8);
193 break;
194#endif /* CONFIG_PPC64 */
195 default:
196 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
197 val = 0;
198 }
199 return val;
200}
201
202/*
203 * Write one PMC.
204 */
205static void write_pmc(int idx, unsigned long val)
206{
207 switch (idx) {
208 case 1:
209 mtspr(SPRN_PMC1, val);
210 break;
211 case 2:
212 mtspr(SPRN_PMC2, val);
213 break;
214 case 3:
215 mtspr(SPRN_PMC3, val);
216 break;
217 case 4:
218 mtspr(SPRN_PMC4, val);
219 break;
220 case 5:
221 mtspr(SPRN_PMC5, val);
222 break;
223 case 6:
224 mtspr(SPRN_PMC6, val);
225 break;
226#ifdef CONFIG_PPC64
227 case 7:
228 mtspr(SPRN_PMC7, val);
229 break;
230 case 8:
231 mtspr(SPRN_PMC8, val);
232 break;
233#endif /* CONFIG_PPC64 */
234 default:
235 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
236 }
237}
238
239/*
240 * Check if a set of events can all go on the PMU at once.
241 * If they can't, this will look at alternative codes for the events
242 * and see if any combination of alternative codes is feasible.
243 * The feasible set is returned in event_id[].
244 */
245static int power_check_constraints(struct cpu_hw_events *cpuhw,
246 u64 event_id[], unsigned int cflags[],
247 int n_ev)
248{
249 unsigned long mask, value, nv;
250 unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
251 int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
252 int i, j;
253 unsigned long addf = ppmu->add_fields;
254 unsigned long tadd = ppmu->test_adder;
255
256 if (n_ev > ppmu->n_event)
257 return -1;
258
259 /* First see if the events will go on as-is */
260 for (i = 0; i < n_ev; ++i) {
261 if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
262 && !ppmu->limited_pmc_event(event_id[i])) {
263 ppmu->get_alternatives(event_id[i], cflags[i],
264 cpuhw->alternatives[i]);
265 event_id[i] = cpuhw->alternatives[i][0];
266 }
267 if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
268 &cpuhw->avalues[i][0]))
269 return -1;
270 }
271 value = mask = 0;
272 for (i = 0; i < n_ev; ++i) {
273 nv = (value | cpuhw->avalues[i][0]) +
274 (value & cpuhw->avalues[i][0] & addf);
275 if ((((nv + tadd) ^ value) & mask) != 0 ||
276 (((nv + tadd) ^ cpuhw->avalues[i][0]) &
277 cpuhw->amasks[i][0]) != 0)
278 break;
279 value = nv;
280 mask |= cpuhw->amasks[i][0];
281 }
282 if (i == n_ev)
283 return 0; /* all OK */
284
285 /* doesn't work, gather alternatives... */
286 if (!ppmu->get_alternatives)
287 return -1;
288 for (i = 0; i < n_ev; ++i) {
289 choice[i] = 0;
290 n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
291 cpuhw->alternatives[i]);
292 for (j = 1; j < n_alt[i]; ++j)
293 ppmu->get_constraint(cpuhw->alternatives[i][j],
294 &cpuhw->amasks[i][j],
295 &cpuhw->avalues[i][j]);
296 }
297
298 /* enumerate all possibilities and see if any will work */
299 i = 0;
300 j = -1;
301 value = mask = nv = 0;
302 while (i < n_ev) {
303 if (j >= 0) {
304 /* we're backtracking, restore context */
305 value = svalues[i];
306 mask = smasks[i];
307 j = choice[i];
308 }
309 /*
310 * See if any alternative k for event_id i,
311 * where k > j, will satisfy the constraints.
312 */
313 while (++j < n_alt[i]) {
314 nv = (value | cpuhw->avalues[i][j]) +
315 (value & cpuhw->avalues[i][j] & addf);
316 if ((((nv + tadd) ^ value) & mask) == 0 &&
317 (((nv + tadd) ^ cpuhw->avalues[i][j])
318 & cpuhw->amasks[i][j]) == 0)
319 break;
320 }
321 if (j >= n_alt[i]) {
322 /*
323 * No feasible alternative, backtrack
324 * to event_id i-1 and continue enumerating its
325 * alternatives from where we got up to.
326 */
327 if (--i < 0)
328 return -1;
329 } else {
330 /*
331 * Found a feasible alternative for event_id i,
332 * remember where we got up to with this event_id,
333 * go on to the next event_id, and start with
334 * the first alternative for it.
335 */
336 choice[i] = j;
337 svalues[i] = value;
338 smasks[i] = mask;
339 value = nv;
340 mask |= cpuhw->amasks[i][j];
341 ++i;
342 j = -1;
343 }
344 }
345
346 /* OK, we have a feasible combination, tell the caller the solution */
347 for (i = 0; i < n_ev; ++i)
348 event_id[i] = cpuhw->alternatives[i][choice[i]];
349 return 0;
350}
351
352/*
353 * Check if newly-added events have consistent settings for
354 * exclude_{user,kernel,hv} with each other and any previously
355 * added events.
356 */
357static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
358 int n_prev, int n_new)
359{
360 int eu = 0, ek = 0, eh = 0;
361 int i, n, first;
362 struct perf_event *event;
363
364 n = n_prev + n_new;
365 if (n <= 1)
366 return 0;
367
368 first = 1;
369 for (i = 0; i < n; ++i) {
370 if (cflags[i] & PPMU_LIMITED_PMC_OK) {
371 cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
372 continue;
373 }
374 event = ctrs[i];
375 if (first) {
376 eu = event->attr.exclude_user;
377 ek = event->attr.exclude_kernel;
378 eh = event->attr.exclude_hv;
379 first = 0;
380 } else if (event->attr.exclude_user != eu ||
381 event->attr.exclude_kernel != ek ||
382 event->attr.exclude_hv != eh) {
383 return -EAGAIN;
384 }
385 }
386
387 if (eu || ek || eh)
388 for (i = 0; i < n; ++i)
389 if (cflags[i] & PPMU_LIMITED_PMC_OK)
390 cflags[i] |= PPMU_LIMITED_PMC_REQD;
391
392 return 0;
393}
394
395static void power_pmu_read(struct perf_event *event)
396{
397 s64 val, delta, prev;
398
399 if (!event->hw.idx)
400 return;
401 /*
402 * Performance monitor interrupts come even when interrupts
403 * are soft-disabled, as long as interrupts are hard-enabled.
404 * Therefore we treat them like NMIs.
405 */
406 do {
407 prev = atomic64_read(&event->hw.prev_count);
408 barrier();
409 val = read_pmc(event->hw.idx);
410 } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
411
412 /* The events are only 32 bits wide */
413 delta = (val - prev) & 0xfffffffful;
414 atomic64_add(delta, &event->count);
415 atomic64_sub(delta, &event->hw.period_left);
416}
417
418/*
419 * On some machines, PMC5 and PMC6 can't be written, don't respect
420 * the freeze conditions, and don't generate interrupts. This tells
421 * us if `event' is using such a PMC.
422 */
423static int is_limited_pmc(int pmcnum)
424{
425 return (ppmu->flags & PPMU_LIMITED_PMC5_6)
426 && (pmcnum == 5 || pmcnum == 6);
427}
428
429static void freeze_limited_events(struct cpu_hw_events *cpuhw,
430 unsigned long pmc5, unsigned long pmc6)
431{
432 struct perf_event *event;
433 u64 val, prev, delta;
434 int i;
435
436 for (i = 0; i < cpuhw->n_limited; ++i) {
437 event = cpuhw->limited_event[i];
438 if (!event->hw.idx)
439 continue;
440 val = (event->hw.idx == 5) ? pmc5 : pmc6;
441 prev = atomic64_read(&event->hw.prev_count);
442 event->hw.idx = 0;
443 delta = (val - prev) & 0xfffffffful;
444 atomic64_add(delta, &event->count);
445 }
446}
447
448static void thaw_limited_events(struct cpu_hw_events *cpuhw,
449 unsigned long pmc5, unsigned long pmc6)
450{
451 struct perf_event *event;
452 u64 val;
453 int i;
454
455 for (i = 0; i < cpuhw->n_limited; ++i) {
456 event = cpuhw->limited_event[i];
457 event->hw.idx = cpuhw->limited_hwidx[i];
458 val = (event->hw.idx == 5) ? pmc5 : pmc6;
459 atomic64_set(&event->hw.prev_count, val);
460 perf_event_update_userpage(event);
461 }
462}
463
464/*
465 * Since limited events don't respect the freeze conditions, we
466 * have to read them immediately after freezing or unfreezing the
467 * other events. We try to keep the values from the limited
468 * events as consistent as possible by keeping the delay (in
469 * cycles and instructions) between freezing/unfreezing and reading
470 * the limited events as small and consistent as possible.
471 * Therefore, if any limited events are in use, we read them
472 * both, and always in the same order, to minimize variability,
473 * and do it inside the same asm that writes MMCR0.
474 */
475static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
476{
477 unsigned long pmc5, pmc6;
478
479 if (!cpuhw->n_limited) {
480 mtspr(SPRN_MMCR0, mmcr0);
481 return;
482 }
483
484 /*
485 * Write MMCR0, then read PMC5 and PMC6 immediately.
486 * To ensure we don't get a performance monitor interrupt
487 * between writing MMCR0 and freezing/thawing the limited
488 * events, we first write MMCR0 with the event overflow
489 * interrupt enable bits turned off.
490 */
491 asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
492 : "=&r" (pmc5), "=&r" (pmc6)
493 : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
494 "i" (SPRN_MMCR0),
495 "i" (SPRN_PMC5), "i" (SPRN_PMC6));
496
497 if (mmcr0 & MMCR0_FC)
498 freeze_limited_events(cpuhw, pmc5, pmc6);
499 else
500 thaw_limited_events(cpuhw, pmc5, pmc6);
501
502 /*
503 * Write the full MMCR0 including the event overflow interrupt
504 * enable bits, if necessary.
505 */
506 if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
507 mtspr(SPRN_MMCR0, mmcr0);
508}
509
510/*
511 * Disable all events to prevent PMU interrupts and to allow
512 * events to be added or removed.
513 */
514void hw_perf_disable(void)
515{
516 struct cpu_hw_events *cpuhw;
517 unsigned long flags;
518
519 if (!ppmu)
520 return;
521 local_irq_save(flags);
522 cpuhw = &__get_cpu_var(cpu_hw_events);
523
524 if (!cpuhw->disabled) {
525 cpuhw->disabled = 1;
526 cpuhw->n_added = 0;
527
528 /*
529 * Check if we ever enabled the PMU on this cpu.
530 */
531 if (!cpuhw->pmcs_enabled) {
532 ppc_enable_pmcs();
533 cpuhw->pmcs_enabled = 1;
534 }
535
536 /*
537 * Disable instruction sampling if it was enabled
538 */
539 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
540 mtspr(SPRN_MMCRA,
541 cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
542 mb();
543 }
544
545 /*
546 * Set the 'freeze events' bit.
547 * The barrier is to make sure the mtspr has been
548 * executed and the PMU has frozen the events
549 * before we return.
550 */
551 write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
552 mb();
553 }
554 local_irq_restore(flags);
555}
556
557/*
558 * Re-enable all events if disable == 0.
559 * If we were previously disabled and events were added, then
560 * put the new config on the PMU.
561 */
562void hw_perf_enable(void)
563{
564 struct perf_event *event;
565 struct cpu_hw_events *cpuhw;
566 unsigned long flags;
567 long i;
568 unsigned long val;
569 s64 left;
570 unsigned int hwc_index[MAX_HWEVENTS];
571 int n_lim;
572 int idx;
573
574 if (!ppmu)
575 return;
576 local_irq_save(flags);
577 cpuhw = &__get_cpu_var(cpu_hw_events);
578 if (!cpuhw->disabled) {
579 local_irq_restore(flags);
580 return;
581 }
582 cpuhw->disabled = 0;
583
584 /*
585 * If we didn't change anything, or only removed events,
586 * no need to recalculate MMCR* settings and reset the PMCs.
587 * Just reenable the PMU with the current MMCR* settings
588 * (possibly updated for removal of events).
589 */
590 if (!cpuhw->n_added) {
591 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
592 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
593 if (cpuhw->n_events == 0)
594 ppc_set_pmu_inuse(0);
595 goto out_enable;
596 }
597
598 /*
599 * Compute MMCR* values for the new set of events
600 */
601 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
602 cpuhw->mmcr)) {
603 /* shouldn't ever get here */
604 printk(KERN_ERR "oops compute_mmcr failed\n");
605 goto out;
606 }
607
608 /*
609 * Add in MMCR0 freeze bits corresponding to the
610 * attr.exclude_* bits for the first event.
611 * We have already checked that all events have the
612 * same values for these bits as the first event.
613 */
614 event = cpuhw->event[0];
615 if (event->attr.exclude_user)
616 cpuhw->mmcr[0] |= MMCR0_FCP;
617 if (event->attr.exclude_kernel)
618 cpuhw->mmcr[0] |= freeze_events_kernel;
619 if (event->attr.exclude_hv)
620 cpuhw->mmcr[0] |= MMCR0_FCHV;
621
622 /*
623 * Write the new configuration to MMCR* with the freeze
624 * bit set and set the hardware events to their initial values.
625 * Then unfreeze the events.
626 */
627 ppc_set_pmu_inuse(1);
628 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
629 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
630 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
631 | MMCR0_FC);
632
633 /*
634 * Read off any pre-existing events that need to move
635 * to another PMC.
636 */
637 for (i = 0; i < cpuhw->n_events; ++i) {
638 event = cpuhw->event[i];
639 if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
640 power_pmu_read(event);
641 write_pmc(event->hw.idx, 0);
642 event->hw.idx = 0;
643 }
644 }
645
646 /*
647 * Initialize the PMCs for all the new and moved events.
648 */
649 cpuhw->n_limited = n_lim = 0;
650 for (i = 0; i < cpuhw->n_events; ++i) {
651 event = cpuhw->event[i];
652 if (event->hw.idx)
653 continue;
654 idx = hwc_index[i] + 1;
655 if (is_limited_pmc(idx)) {
656 cpuhw->limited_event[n_lim] = event;
657 cpuhw->limited_hwidx[n_lim] = idx;
658 ++n_lim;
659 continue;
660 }
661 val = 0;
662 if (event->hw.sample_period) {
663 left = atomic64_read(&event->hw.period_left);
664 if (left < 0x80000000L)
665 val = 0x80000000L - left;
666 }
667 atomic64_set(&event->hw.prev_count, val);
668 event->hw.idx = idx;
669 write_pmc(idx, val);
670 perf_event_update_userpage(event);
671 }
672 cpuhw->n_limited = n_lim;
673 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
674
675 out_enable:
676 mb();
677 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
678
679 /*
680 * Enable instruction sampling if necessary
681 */
682 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
683 mb();
684 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
685 }
686
687 out:
688 local_irq_restore(flags);
689}
690
691static int collect_events(struct perf_event *group, int max_count,
692 struct perf_event *ctrs[], u64 *events,
693 unsigned int *flags)
694{
695 int n = 0;
696 struct perf_event *event;
697
698 if (!is_software_event(group)) {
699 if (n >= max_count)
700 return -1;
701 ctrs[n] = group;
702 flags[n] = group->hw.event_base;
703 events[n++] = group->hw.config;
704 }
705 list_for_each_entry(event, &group->sibling_list, list_entry) {
706 if (!is_software_event(event) &&
707 event->state != PERF_EVENT_STATE_OFF) {
708 if (n >= max_count)
709 return -1;
710 ctrs[n] = event;
711 flags[n] = event->hw.event_base;
712 events[n++] = event->hw.config;
713 }
714 }
715 return n;
716}
717
718static void event_sched_in(struct perf_event *event, int cpu)
719{
720 event->state = PERF_EVENT_STATE_ACTIVE;
721 event->oncpu = cpu;
722 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
723 if (is_software_event(event))
724 event->pmu->enable(event);
725}
726
727/*
728 * Called to enable a whole group of events.
729 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
730 * Assumes the caller has disabled interrupts and has
731 * frozen the PMU with hw_perf_save_disable.
732 */
733int hw_perf_group_sched_in(struct perf_event *group_leader,
734 struct perf_cpu_context *cpuctx,
735 struct perf_event_context *ctx, int cpu)
736{
737 struct cpu_hw_events *cpuhw;
738 long i, n, n0;
739 struct perf_event *sub;
740
741 if (!ppmu)
742 return 0;
743 cpuhw = &__get_cpu_var(cpu_hw_events);
744 n0 = cpuhw->n_events;
745 n = collect_events(group_leader, ppmu->n_event - n0,
746 &cpuhw->event[n0], &cpuhw->events[n0],
747 &cpuhw->flags[n0]);
748 if (n < 0)
749 return -EAGAIN;
750 if (check_excludes(cpuhw->event, cpuhw->flags, n0, n))
751 return -EAGAIN;
752 i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0);
753 if (i < 0)
754 return -EAGAIN;
755 cpuhw->n_events = n0 + n;
756 cpuhw->n_added += n;
757
758 /*
759 * OK, this group can go on; update event states etc.,
760 * and enable any software events
761 */
762 for (i = n0; i < n0 + n; ++i)
763 cpuhw->event[i]->hw.config = cpuhw->events[i];
764 cpuctx->active_oncpu += n;
765 n = 1;
766 event_sched_in(group_leader, cpu);
767 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
768 if (sub->state != PERF_EVENT_STATE_OFF) {
769 event_sched_in(sub, cpu);
770 ++n;
771 }
772 }
773 ctx->nr_active += n;
774
775 return 1;
776}
777
778/*
779 * Add a event to the PMU.
780 * If all events are not already frozen, then we disable and
781 * re-enable the PMU in order to get hw_perf_enable to do the
782 * actual work of reconfiguring the PMU.
783 */
784static int power_pmu_enable(struct perf_event *event)
785{
786 struct cpu_hw_events *cpuhw;
787 unsigned long flags;
788 int n0;
789 int ret = -EAGAIN;
790
791 local_irq_save(flags);
792 perf_disable();
793
794 /*
795 * Add the event to the list (if there is room)
796 * and check whether the total set is still feasible.
797 */
798 cpuhw = &__get_cpu_var(cpu_hw_events);
799 n0 = cpuhw->n_events;
800 if (n0 >= ppmu->n_event)
801 goto out;
802 cpuhw->event[n0] = event;
803 cpuhw->events[n0] = event->hw.config;
804 cpuhw->flags[n0] = event->hw.event_base;
805 if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
806 goto out;
807 if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
808 goto out;
809
810 event->hw.config = cpuhw->events[n0];
811 ++cpuhw->n_events;
812 ++cpuhw->n_added;
813
814 ret = 0;
815 out:
816 perf_enable();
817 local_irq_restore(flags);
818 return ret;
819}
820
821/*
822 * Remove a event from the PMU.
823 */
824static void power_pmu_disable(struct perf_event *event)
825{
826 struct cpu_hw_events *cpuhw;
827 long i;
828 unsigned long flags;
829
830 local_irq_save(flags);
831 perf_disable();
832
833 power_pmu_read(event);
834
835 cpuhw = &__get_cpu_var(cpu_hw_events);
836 for (i = 0; i < cpuhw->n_events; ++i) {
837 if (event == cpuhw->event[i]) {
838 while (++i < cpuhw->n_events)
839 cpuhw->event[i-1] = cpuhw->event[i];
840 --cpuhw->n_events;
841 ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
842 if (event->hw.idx) {
843 write_pmc(event->hw.idx, 0);
844 event->hw.idx = 0;
845 }
846 perf_event_update_userpage(event);
847 break;
848 }
849 }
850 for (i = 0; i < cpuhw->n_limited; ++i)
851 if (event == cpuhw->limited_event[i])
852 break;
853 if (i < cpuhw->n_limited) {
854 while (++i < cpuhw->n_limited) {
855 cpuhw->limited_event[i-1] = cpuhw->limited_event[i];
856 cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
857 }
858 --cpuhw->n_limited;
859 }
860 if (cpuhw->n_events == 0) {
861 /* disable exceptions if no events are running */
862 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
863 }
864
865 perf_enable();
866 local_irq_restore(flags);
867}
868
869/*
870 * Re-enable interrupts on a event after they were throttled
871 * because they were coming too fast.
872 */
873static void power_pmu_unthrottle(struct perf_event *event)
874{
875 s64 val, left;
876 unsigned long flags;
877
878 if (!event->hw.idx || !event->hw.sample_period)
879 return;
880 local_irq_save(flags);
881 perf_disable();
882 power_pmu_read(event);
883 left = event->hw.sample_period;
884 event->hw.last_period = left;
885 val = 0;
886 if (left < 0x80000000L)
887 val = 0x80000000L - left;
888 write_pmc(event->hw.idx, val);
889 atomic64_set(&event->hw.prev_count, val);
890 atomic64_set(&event->hw.period_left, left);
891 perf_event_update_userpage(event);
892 perf_enable();
893 local_irq_restore(flags);
894}
895
896struct pmu power_pmu = {
897 .enable = power_pmu_enable,
898 .disable = power_pmu_disable,
899 .read = power_pmu_read,
900 .unthrottle = power_pmu_unthrottle,
901};
902
903/*
904 * Return 1 if we might be able to put event on a limited PMC,
905 * or 0 if not.
906 * A event can only go on a limited PMC if it counts something
907 * that a limited PMC can count, doesn't require interrupts, and
908 * doesn't exclude any processor mode.
909 */
910static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
911 unsigned int flags)
912{
913 int n;
914 u64 alt[MAX_EVENT_ALTERNATIVES];
915
916 if (event->attr.exclude_user
917 || event->attr.exclude_kernel
918 || event->attr.exclude_hv
919 || event->attr.sample_period)
920 return 0;
921
922 if (ppmu->limited_pmc_event(ev))
923 return 1;
924
925 /*
926 * The requested event_id isn't on a limited PMC already;
927 * see if any alternative code goes on a limited PMC.
928 */
929 if (!ppmu->get_alternatives)
930 return 0;
931
932 flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
933 n = ppmu->get_alternatives(ev, flags, alt);
934
935 return n > 0;
936}
937
938/*
939 * Find an alternative event_id that goes on a normal PMC, if possible,
940 * and return the event_id code, or 0 if there is no such alternative.
941 * (Note: event_id code 0 is "don't count" on all machines.)
942 */
943static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
944{
945 u64 alt[MAX_EVENT_ALTERNATIVES];
946 int n;
947
948 flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
949 n = ppmu->get_alternatives(ev, flags, alt);
950 if (!n)
951 return 0;
952 return alt[0];
953}
954
955/* Number of perf_events counting hardware events */
956static atomic_t num_events;
957/* Used to avoid races in calling reserve/release_pmc_hardware */
958static DEFINE_MUTEX(pmc_reserve_mutex);
959
960/*
961 * Release the PMU if this is the last perf_event.
962 */
963static void hw_perf_event_destroy(struct perf_event *event)
964{
965 if (!atomic_add_unless(&num_events, -1, 1)) {
966 mutex_lock(&pmc_reserve_mutex);
967 if (atomic_dec_return(&num_events) == 0)
968 release_pmc_hardware();
969 mutex_unlock(&pmc_reserve_mutex);
970 }
971}
972
973/*
974 * Translate a generic cache event_id config to a raw event_id code.
975 */
976static int hw_perf_cache_event(u64 config, u64 *eventp)
977{
978 unsigned long type, op, result;
979 int ev;
980
981 if (!ppmu->cache_events)
982 return -EINVAL;
983
984 /* unpack config */
985 type = config & 0xff;
986 op = (config >> 8) & 0xff;
987 result = (config >> 16) & 0xff;
988
989 if (type >= PERF_COUNT_HW_CACHE_MAX ||
990 op >= PERF_COUNT_HW_CACHE_OP_MAX ||
991 result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
992 return -EINVAL;
993
994 ev = (*ppmu->cache_events)[type][op][result];
995 if (ev == 0)
996 return -EOPNOTSUPP;
997 if (ev == -1)
998 return -EINVAL;
999 *eventp = ev;
1000 return 0;
1001}
1002
1003const struct pmu *hw_perf_event_init(struct perf_event *event)
1004{
1005 u64 ev;
1006 unsigned long flags;
1007 struct perf_event *ctrs[MAX_HWEVENTS];
1008 u64 events[MAX_HWEVENTS];
1009 unsigned int cflags[MAX_HWEVENTS];
1010 int n;
1011 int err;
1012 struct cpu_hw_events *cpuhw;
1013
1014 if (!ppmu)
1015 return ERR_PTR(-ENXIO);
1016 switch (event->attr.type) {
1017 case PERF_TYPE_HARDWARE:
1018 ev = event->attr.config;
1019 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
1020 return ERR_PTR(-EOPNOTSUPP);
1021 ev = ppmu->generic_events[ev];
1022 break;
1023 case PERF_TYPE_HW_CACHE:
1024 err = hw_perf_cache_event(event->attr.config, &ev);
1025 if (err)
1026 return ERR_PTR(err);
1027 break;
1028 case PERF_TYPE_RAW:
1029 ev = event->attr.config;
1030 break;
1031 default:
1032 return ERR_PTR(-EINVAL);
1033 }
1034 event->hw.config_base = ev;
1035 event->hw.idx = 0;
1036
1037 /*
1038 * If we are not running on a hypervisor, force the
1039 * exclude_hv bit to 0 so that we don't care what
1040 * the user set it to.
1041 */
1042 if (!firmware_has_feature(FW_FEATURE_LPAR))
1043 event->attr.exclude_hv = 0;
1044
1045 /*
1046 * If this is a per-task event, then we can use
1047 * PM_RUN_* events interchangeably with their non RUN_*
1048 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
1049 * XXX we should check if the task is an idle task.
1050 */
1051 flags = 0;
1052 if (event->ctx->task)
1053 flags |= PPMU_ONLY_COUNT_RUN;
1054
1055 /*
1056 * If this machine has limited events, check whether this
1057 * event_id could go on a limited event.
1058 */
1059 if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
1060 if (can_go_on_limited_pmc(event, ev, flags)) {
1061 flags |= PPMU_LIMITED_PMC_OK;
1062 } else if (ppmu->limited_pmc_event(ev)) {
1063 /*
1064 * The requested event_id is on a limited PMC,
1065 * but we can't use a limited PMC; see if any
1066 * alternative goes on a normal PMC.
1067 */
1068 ev = normal_pmc_alternative(ev, flags);
1069 if (!ev)
1070 return ERR_PTR(-EINVAL);
1071 }
1072 }
1073
1074 /*
1075 * If this is in a group, check if it can go on with all the
1076 * other hardware events in the group. We assume the event
1077 * hasn't been linked into its leader's sibling list at this point.
1078 */
1079 n = 0;
1080 if (event->group_leader != event) {
1081 n = collect_events(event->group_leader, ppmu->n_event - 1,
1082 ctrs, events, cflags);
1083 if (n < 0)
1084 return ERR_PTR(-EINVAL);
1085 }
1086 events[n] = ev;
1087 ctrs[n] = event;
1088 cflags[n] = flags;
1089 if (check_excludes(ctrs, cflags, n, 1))
1090 return ERR_PTR(-EINVAL);
1091
1092 cpuhw = &get_cpu_var(cpu_hw_events);
1093 err = power_check_constraints(cpuhw, events, cflags, n + 1);
1094 put_cpu_var(cpu_hw_events);
1095 if (err)
1096 return ERR_PTR(-EINVAL);
1097
1098 event->hw.config = events[n];
1099 event->hw.event_base = cflags[n];
1100 event->hw.last_period = event->hw.sample_period;
1101 atomic64_set(&event->hw.period_left, event->hw.last_period);
1102
1103 /*
1104 * See if we need to reserve the PMU.
1105 * If no events are currently in use, then we have to take a
1106 * mutex to ensure that we don't race with another task doing
1107 * reserve_pmc_hardware or release_pmc_hardware.
1108 */
1109 err = 0;
1110 if (!atomic_inc_not_zero(&num_events)) {
1111 mutex_lock(&pmc_reserve_mutex);
1112 if (atomic_read(&num_events) == 0 &&
1113 reserve_pmc_hardware(perf_event_interrupt))
1114 err = -EBUSY;
1115 else
1116 atomic_inc(&num_events);
1117 mutex_unlock(&pmc_reserve_mutex);
1118 }
1119 event->destroy = hw_perf_event_destroy;
1120
1121 if (err)
1122 return ERR_PTR(err);
1123 return &power_pmu;
1124}
1125
1126/*
1127 * A event has overflowed; update its count and record
1128 * things if requested. Note that interrupts are hard-disabled
1129 * here so there is no possibility of being interrupted.
1130 */
1131static void record_and_restart(struct perf_event *event, unsigned long val,
1132 struct pt_regs *regs, int nmi)
1133{
1134 u64 period = event->hw.sample_period;
1135 s64 prev, delta, left;
1136 int record = 0;
1137
1138 /* we don't have to worry about interrupts here */
1139 prev = atomic64_read(&event->hw.prev_count);
1140 delta = (val - prev) & 0xfffffffful;
1141 atomic64_add(delta, &event->count);
1142
1143 /*
1144 * See if the total period for this event has expired,
1145 * and update for the next period.
1146 */
1147 val = 0;
1148 left = atomic64_read(&event->hw.period_left) - delta;
1149 if (period) {
1150 if (left <= 0) {
1151 left += period;
1152 if (left <= 0)
1153 left = period;
1154 record = 1;
1155 }
1156 if (left < 0x80000000LL)
1157 val = 0x80000000LL - left;
1158 }
1159
1160 /*
1161 * Finally record data if requested.
1162 */
1163 if (record) {
1164 struct perf_sample_data data = {
1165 .addr = 0,
1166 .period = event->hw.last_period,
1167 };
1168
1169 if (event->attr.sample_type & PERF_SAMPLE_ADDR)
1170 perf_get_data_addr(regs, &data.addr);
1171
1172 if (perf_event_overflow(event, nmi, &data, regs)) {
1173 /*
1174 * Interrupts are coming too fast - throttle them
1175 * by setting the event to 0, so it will be
1176 * at least 2^30 cycles until the next interrupt
1177 * (assuming each event counts at most 2 counts
1178 * per cycle).
1179 */
1180 val = 0;
1181 left = ~0ULL >> 1;
1182 }
1183 }
1184
1185 write_pmc(event->hw.idx, val);
1186 atomic64_set(&event->hw.prev_count, val);
1187 atomic64_set(&event->hw.period_left, left);
1188 perf_event_update_userpage(event);
1189}
1190
1191/*
1192 * Called from generic code to get the misc flags (i.e. processor mode)
1193 * for an event_id.
1194 */
1195unsigned long perf_misc_flags(struct pt_regs *regs)
1196{
1197 u32 flags = perf_get_misc_flags(regs);
1198
1199 if (flags)
1200 return flags;
1201 return user_mode(regs) ? PERF_RECORD_MISC_USER :
1202 PERF_RECORD_MISC_KERNEL;
1203}
1204
1205/*
1206 * Called from generic code to get the instruction pointer
1207 * for an event_id.
1208 */
1209unsigned long perf_instruction_pointer(struct pt_regs *regs)
1210{
1211 unsigned long ip;
1212
1213 if (TRAP(regs) != 0xf00)
1214 return regs->nip; /* not a PMU interrupt */
1215
1216 ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
1217 return ip;
1218}
1219
1220/*
1221 * Performance monitor interrupt stuff
1222 */
1223static void perf_event_interrupt(struct pt_regs *regs)
1224{
1225 int i;
1226 struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1227 struct perf_event *event;
1228 unsigned long val;
1229 int found = 0;
1230 int nmi;
1231
1232 if (cpuhw->n_limited)
1233 freeze_limited_events(cpuhw, mfspr(SPRN_PMC5),
1234 mfspr(SPRN_PMC6));
1235
1236 perf_read_regs(regs);
1237
1238 nmi = perf_intr_is_nmi(regs);
1239 if (nmi)
1240 nmi_enter();
1241 else
1242 irq_enter();
1243
1244 for (i = 0; i < cpuhw->n_events; ++i) {
1245 event = cpuhw->event[i];
1246 if (!event->hw.idx || is_limited_pmc(event->hw.idx))
1247 continue;
1248 val = read_pmc(event->hw.idx);
1249 if ((int)val < 0) {
1250 /* event has overflowed */
1251 found = 1;
1252 record_and_restart(event, val, regs, nmi);
1253 }
1254 }
1255
1256 /*
1257 * In case we didn't find and reset the event that caused
1258 * the interrupt, scan all events and reset any that are
1259 * negative, to avoid getting continual interrupts.
1260 * Any that we processed in the previous loop will not be negative.
1261 */
1262 if (!found) {
1263 for (i = 0; i < ppmu->n_event; ++i) {
1264 if (is_limited_pmc(i + 1))
1265 continue;
1266 val = read_pmc(i + 1);
1267 if ((int)val < 0)
1268 write_pmc(i + 1, 0);
1269 }
1270 }
1271
1272 /*
1273 * Reset MMCR0 to its normal value. This will set PMXE and
1274 * clear FC (freeze events) and PMAO (perf mon alert occurred)
1275 * and thus allow interrupts to occur again.
1276 * XXX might want to use MSR.PM to keep the events frozen until
1277 * we get back out of this interrupt.
1278 */
1279 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1280
1281 if (nmi)
1282 nmi_exit();
1283 else
1284 irq_exit();
1285}
1286
1287void hw_perf_event_setup(int cpu)
1288{
1289 struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
1290
1291 if (!ppmu)
1292 return;
1293 memset(cpuhw, 0, sizeof(*cpuhw));
1294 cpuhw->mmcr[0] = MMCR0_FC;
1295}
1296
1297int register_power_pmu(struct power_pmu *pmu)
1298{
1299 if (ppmu)
1300 return -EBUSY; /* something's already registered */
1301
1302 ppmu = pmu;
1303 pr_info("%s performance monitor hardware support registered\n",
1304 pmu->name);
1305
1306#ifdef MSR_HV
1307 /*
1308 * Use FCHV to ignore kernel events if MSR.HV is set.
1309 */
1310 if (mfmsr() & MSR_HV)
1311 freeze_events_kernel = MMCR0_FCHV;
1312#endif /* CONFIG_PPC64 */
1313
1314 return 0;
1315}
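For context, this core code is driven by a CPU-specific back-end that fills in a struct power_pmu and registers it via register_power_pmu() at boot. A minimal sketch follows; the mycpu_* names and PMC count are hypothetical, the field names follow how ppmu-> members are used in the file above, and the callback signatures are inferred from their call sites:

  /* Callback bodies elided; signatures inferred from the ppmu-> call sites above. */
  static int  mycpu_compute_mmcr(u64 event_id[], int n_ev,
  			       unsigned int hwc[], unsigned long mmcr[]);
  static int  mycpu_get_constraint(u64 event_id, unsigned long *maskp,
  				 unsigned long *valp);
  static int  mycpu_get_alternatives(u64 event_id, unsigned int flags, u64 alt[]);
  static void mycpu_disable_pmc(unsigned int pmc, unsigned long mmcr[]);

  static struct power_pmu mycpu_pmu = {
  	.name			= "MYCPU",
  	.n_event		= 6,		/* number of PMCs */
  	.compute_mmcr		= mycpu_compute_mmcr,
  	.get_constraint		= mycpu_get_constraint,
  	.get_alternatives	= mycpu_get_alternatives,
  	.disable_pmc		= mycpu_disable_pmc,
  	/* add_fields, test_adder, flags, n_generic, generic_events,
  	 * cache_events, limited_pmc_event etc. elided in this sketch. */
  };

  static int __init init_mycpu_pmu(void)
  {
  	/* A real back-end would first check the CPU type (PVR) here. */
  	return register_power_pmu(&mycpu_pmu);
  }
  arch_initcall(init_mycpu_pmu);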