aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorStephane Eranian <eranian@google.com>2012-02-09 17:20:58 -0500
committerIngo Molnar <mingo@elte.hu>2012-03-05 08:55:42 -0500
commit3e702ff6d1ea12dcf1c798ecb61e7f3a1579df42 (patch)
treeaa9afc6fcda680d0048a20203b76aede9a16de51 /arch/x86
parent60ce0fbd072695866cb27b729690ab59dce705a5 (diff)
perf/x86: Add LBR software filter support for Intel CPUs
This patch adds an internal software filter to complement the (optional) LBR hardware filter. The software filter is necessary: - as a substitute when there is no HW LBR filter (e.g., Atom, Core) - to complement the HW LBR filter in case of errata (e.g., Nehalem/Westmere) - to provide finer-grained filtering (e.g., all processors) Sometimes the LBR HW filter cannot distinguish between two types of branches. For instance, to capture syscall as CALLS, it is necessary to enable the LBR_FAR filter which will also capture JMP instructions. Thus, a second pass is necessary to filter those out; this is what the SW filter can do. The SW filter is built on top of the internal x86 disassembler. It is a best-effort filter, especially for user-level code. It is subject to the availability of the text page of the program. The SW filter is enabled on all Intel processors. It is bypassed when the user is capturing all branches at all priv levels. Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1328826068-11713-9-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/kernel/cpu/perf_event.h10
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c332
3 files changed, 321 insertions, 33 deletions
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index ef7419cbd13d..f104c054dc5c 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -132,6 +132,7 @@ struct cpu_hw_events {
132 struct perf_branch_stack lbr_stack; 132 struct perf_branch_stack lbr_stack;
133 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 133 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
134 struct er_account *lbr_sel; 134 struct er_account *lbr_sel;
135 u64 br_sel;
135 136
136 /* 137 /*
137 * Intel host/guest exclude bits 138 * Intel host/guest exclude bits
@@ -459,6 +460,15 @@ extern struct event_constraint emptyconstraint;
459 460
460extern struct event_constraint unconstrained; 461extern struct event_constraint unconstrained;
461 462
463static inline bool kernel_ip(unsigned long ip)
464{
465#ifdef CONFIG_X86_32
466 return ip > PAGE_OFFSET;
467#else
468 return (long)ip < 0;
469#endif
470}
471
462#ifdef CONFIG_CPU_SUP_AMD 472#ifdef CONFIG_CPU_SUP_AMD
463 473
464int amd_pmu_init(void); 474int amd_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ee7e3c8d9d6a..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5#include <asm/perf_event.h> 5#include <asm/perf_event.h>
6#include <asm/insn.h>
6 7
7#include "perf_event.h" 8#include "perf_event.h"
8 9
@@ -469,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
469 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 470 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
470} 471}
471 472
472#include <asm/insn.h>
473
474static inline bool kernel_ip(unsigned long ip)
475{
476#ifdef CONFIG_X86_32
477 return ip > PAGE_OFFSET;
478#else
479 return (long)ip < 0;
480#endif
481}
482
483static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) 473static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
484{ 474{
485 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 475 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d0fb864ff2b0..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
3 3
4#include <asm/perf_event.h> 4#include <asm/perf_event.h>
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include <asm/insn.h>
6 7
7#include "perf_event.h" 8#include "perf_event.h"
8 9
@@ -61,6 +62,53 @@ enum {
61 (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) 62 (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
62 63
63/* 64/*
65 * x86 control flow change classification
66 * x86 control flow changes include branches, interrupts, traps, faults
67 */
68enum {
69 X86_BR_NONE = 0, /* unknown */
70
71 X86_BR_USER = 1 << 0, /* branch target is user */
72 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
73
74 X86_BR_CALL = 1 << 2, /* call */
75 X86_BR_RET = 1 << 3, /* return */
76 X86_BR_SYSCALL = 1 << 4, /* syscall */
77 X86_BR_SYSRET = 1 << 5, /* syscall return */
78 X86_BR_INT = 1 << 6, /* sw interrupt */
79 X86_BR_IRET = 1 << 7, /* return from interrupt */
80 X86_BR_JCC = 1 << 8, /* conditional */
81 X86_BR_JMP = 1 << 9, /* jump */
82 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
83 X86_BR_IND_CALL = 1 << 11,/* indirect calls */
84};
85
86#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
87
88#define X86_BR_ANY \
89 (X86_BR_CALL |\
90 X86_BR_RET |\
91 X86_BR_SYSCALL |\
92 X86_BR_SYSRET |\
93 X86_BR_INT |\
94 X86_BR_IRET |\
95 X86_BR_JCC |\
96 X86_BR_JMP |\
97 X86_BR_IRQ |\
98 X86_BR_IND_CALL)
99
100#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
101
102#define X86_BR_ANY_CALL \
103 (X86_BR_CALL |\
104 X86_BR_IND_CALL |\
105 X86_BR_SYSCALL |\
106 X86_BR_IRQ |\
107 X86_BR_INT)
108
109static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
110
111/*
64 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI 112 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
65 * otherwise it becomes near impossible to get a reliable stack. 113 * otherwise it becomes near impossible to get a reliable stack.
66 */ 114 */
@@ -131,6 +179,7 @@ void intel_pmu_lbr_enable(struct perf_event *event)
131 intel_pmu_lbr_reset(); 179 intel_pmu_lbr_reset();
132 cpuc->lbr_context = event->ctx; 180 cpuc->lbr_context = event->ctx;
133 } 181 }
182 cpuc->br_sel = event->hw.branch_reg.reg;
134 183
135 cpuc->lbr_users++; 184 cpuc->lbr_users++;
136} 185}
@@ -252,6 +301,44 @@ void intel_pmu_lbr_read(void)
252 intel_pmu_lbr_read_32(cpuc); 301 intel_pmu_lbr_read_32(cpuc);
253 else 302 else
254 intel_pmu_lbr_read_64(cpuc); 303 intel_pmu_lbr_read_64(cpuc);
304
305 intel_pmu_lbr_filter(cpuc);
306}
307
308/*
309 * SW filter is used:
310 * - in case there is no HW filter
311 * - in case the HW filter has errata or limitations
312 */
313static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
314{
315 u64 br_type = event->attr.branch_sample_type;
316 int mask = 0;
317
318 if (br_type & PERF_SAMPLE_BRANCH_USER)
319 mask |= X86_BR_USER;
320
321 if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
322 mask |= X86_BR_KERNEL;
323
324 /* we ignore BRANCH_HV here */
325
326 if (br_type & PERF_SAMPLE_BRANCH_ANY)
327 mask |= X86_BR_ANY;
328
329 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
330 mask |= X86_BR_ANY_CALL;
331
332 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
333 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
334
335 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
336 mask |= X86_BR_IND_CALL;
337 /*
338 * stash actual user request into reg, it may
339 * be used by fixup code for some CPU
340 */
341 event->hw.branch_reg.reg = mask;
255} 342}
256 343
257/* 344/*
@@ -273,10 +360,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
273 v = x86_pmu.lbr_sel_map[m]; 360 v = x86_pmu.lbr_sel_map[m];
274 if (v == LBR_NOT_SUPP) 361 if (v == LBR_NOT_SUPP)
275 return -EOPNOTSUPP; 362 return -EOPNOTSUPP;
276 mask |= v;
277 363
278 if (m == PERF_SAMPLE_BRANCH_ANY) 364 if (v != LBR_IGN)
279 break; 365 mask |= v;
280 } 366 }
281 reg = &event->hw.branch_reg; 367 reg = &event->hw.branch_reg;
282 reg->idx = EXTRA_REG_LBR; 368 reg->idx = EXTRA_REG_LBR;
@@ -287,18 +373,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
287 return 0; 373 return 0;
288} 374}
289 375
290/*
291 * all the bits supported on some flavor of x86LBR
292 * we ignore BRANCH_HV because it is not supported
293 */
294#define PERF_SAMPLE_BRANCH_X86_ALL \
295 (PERF_SAMPLE_BRANCH_ANY |\
296 PERF_SAMPLE_BRANCH_USER |\
297 PERF_SAMPLE_BRANCH_KERNEL)
298
299int intel_pmu_setup_lbr_filter(struct perf_event *event) 376int intel_pmu_setup_lbr_filter(struct perf_event *event)
300{ 377{
301 u64 br_type = event->attr.branch_sample_type; 378 int ret = 0;
302 379
303 /* 380 /*
304 * no LBR on this PMU 381 * no LBR on this PMU
@@ -307,20 +384,210 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
307 return -EOPNOTSUPP; 384 return -EOPNOTSUPP;
308 385
309 /* 386 /*
310 * if no LBR HW filter, users can only 387 * setup SW LBR filter
311 * capture all branches
312 */ 388 */
313 if (!x86_pmu.lbr_sel_map) { 389 intel_pmu_setup_sw_lbr_filter(event);
314 if (br_type != PERF_SAMPLE_BRANCH_X86_ALL) 390
315 return -EOPNOTSUPP; 391 /*
316 return 0; 392 * setup HW LBR filter, if any
393 */
394 if (x86_pmu.lbr_sel_map)
395 ret = intel_pmu_setup_hw_lbr_filter(event);
396
397 return ret;
398}
399
400/*
401 * return the type of control flow change at address "from"
402 * instruction is not necessarily a branch (in case of interrupt).
403 *
404 * The branch type returned also includes the priv level of the
405 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
406 *
407 * If a branch type is unknown OR the instruction cannot be
408 * decoded (e.g., text page not present), then X86_BR_NONE is
409 * returned.
410 */
411static int branch_type(unsigned long from, unsigned long to)
412{
413 struct insn insn;
414 void *addr;
415 int bytes, size = MAX_INSN_SIZE;
416 int ret = X86_BR_NONE;
417 int ext, to_plm, from_plm;
418 u8 buf[MAX_INSN_SIZE];
419 int is64 = 0;
420
421 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
422 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
423
424 /*
425 * maybe zero if lbr did not fill up after a reset by the time
426 * we get a PMU interrupt
427 */
428 if (from == 0 || to == 0)
429 return X86_BR_NONE;
430
431 if (from_plm == X86_BR_USER) {
432 /*
433 * can happen if measuring at the user level only
434 * and we interrupt in a kernel thread, e.g., idle.
435 */
436 if (!current->mm)
437 return X86_BR_NONE;
438
439 /* may fail if text not present */
440 bytes = copy_from_user_nmi(buf, (void __user *)from, size);
441 if (bytes != size)
442 return X86_BR_NONE;
443
444 addr = buf;
445 } else
446 addr = (void *)from;
447
448 /*
449 * decoder needs to know the ABI especially
450 * on 64-bit systems running 32-bit apps
451 */
452#ifdef CONFIG_X86_64
453 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
454#endif
455 insn_init(&insn, addr, is64);
456 insn_get_opcode(&insn);
457
458 switch (insn.opcode.bytes[0]) {
459 case 0xf:
460 switch (insn.opcode.bytes[1]) {
461 case 0x05: /* syscall */
462 case 0x34: /* sysenter */
463 ret = X86_BR_SYSCALL;
464 break;
465 case 0x07: /* sysret */
466 case 0x35: /* sysexit */
467 ret = X86_BR_SYSRET;
468 break;
469 case 0x80 ... 0x8f: /* conditional */
470 ret = X86_BR_JCC;
471 break;
472 default:
473 ret = X86_BR_NONE;
474 }
475 break;
476 case 0x70 ... 0x7f: /* conditional */
477 ret = X86_BR_JCC;
478 break;
479 case 0xc2: /* near ret */
480 case 0xc3: /* near ret */
481 case 0xca: /* far ret */
482 case 0xcb: /* far ret */
483 ret = X86_BR_RET;
484 break;
485 case 0xcf: /* iret */
486 ret = X86_BR_IRET;
487 break;
488 case 0xcc ... 0xce: /* int */
489 ret = X86_BR_INT;
490 break;
491 case 0xe8: /* call near rel */
492 case 0x9a: /* call far absolute */
493 ret = X86_BR_CALL;
494 break;
495 case 0xe0 ... 0xe3: /* loop jmp */
496 ret = X86_BR_JCC;
497 break;
498 case 0xe9 ... 0xeb: /* jmp */
499 ret = X86_BR_JMP;
500 break;
501 case 0xff: /* call near absolute, call far absolute ind */
502 insn_get_modrm(&insn);
503 ext = (insn.modrm.bytes[0] >> 3) & 0x7;
504 switch (ext) {
505 case 2: /* near ind call */
506 case 3: /* far ind call */
507 ret = X86_BR_IND_CALL;
508 break;
509 case 4:
510 case 5:
511 ret = X86_BR_JMP;
512 break;
513 }
514 break;
515 default:
516 ret = X86_BR_NONE;
317 } 517 }
318 /* 518 /*
319 * we ignore branch priv levels we do not 519 * interrupts, traps, faults (and thus ring transition) may
320 * know about: BRANCH_HV 520 * occur on any instructions. Thus, to classify them correctly,
521 * we need to first look at the from and to priv levels. If they
522 * are different and to is in the kernel, then it indicates
523 * a ring transition. If the from instruction is not a ring
524 * transition instr (syscall, sysenter, int), then it means
525 * it was an irq, trap or fault.
526 *
527 * we have no way of detecting kernel to kernel faults.
528 */
529 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
530 && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
531 ret = X86_BR_IRQ;
532
533 /*
534 * branch priv level determined by target as
535 * is done by HW when LBR_SELECT is implemented
321 */ 536 */
537 if (ret != X86_BR_NONE)
538 ret |= to_plm;
322 539
323 return intel_pmu_setup_hw_lbr_filter(event); 540 return ret;
541}
542
543/*
544 * implement actual branch filter based on user demand.
545 * Hardware may not exactly satisfy that request, thus
546 * we need to inspect opcodes. Mismatched branches are
547 * discarded. Therefore, the number of branches returned
548 * in PERF_SAMPLE_BRANCH_STACK sample may vary.
549 */
550static void
551intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
552{
553 u64 from, to;
554 int br_sel = cpuc->br_sel;
555 int i, j, type;
556 bool compress = false;
557
558 /* if sampling all branches, then nothing to filter */
559 if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
560 return;
561
562 for (i = 0; i < cpuc->lbr_stack.nr; i++) {
563
564 from = cpuc->lbr_entries[i].from;
565 to = cpuc->lbr_entries[i].to;
566
567 type = branch_type(from, to);
568
569 /* if type does not correspond, then discard */
570 if (type == X86_BR_NONE || (br_sel & type) != type) {
571 cpuc->lbr_entries[i].from = 0;
572 compress = true;
573 }
574 }
575
576 if (!compress)
577 return;
578
579 /* remove all entries with from=0 */
580 for (i = 0; i < cpuc->lbr_stack.nr; ) {
581 if (!cpuc->lbr_entries[i].from) {
582 j = i;
583 while (++j < cpuc->lbr_stack.nr)
584 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
585 cpuc->lbr_stack.nr--;
586 if (!cpuc->lbr_entries[i].from)
587 continue;
588 }
589 i++;
590 }
324} 591}
325 592
326/* 593/*
@@ -363,6 +630,10 @@ void intel_pmu_lbr_init_core(void)
363 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 630 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
364 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 631 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
365 632
633 /*
634 * SW branch filter usage:
635 * - compensate for lack of HW filter
636 */
366 pr_cont("4-deep LBR, "); 637 pr_cont("4-deep LBR, ");
367} 638}
368 639
@@ -377,6 +648,13 @@ void intel_pmu_lbr_init_nhm(void)
377 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 648 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
378 x86_pmu.lbr_sel_map = nhm_lbr_sel_map; 649 x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
379 650
651 /*
652 * SW branch filter usage:
653 * - workaround LBR_SEL errata (see above)
654 * - support syscall, sysret capture.
655 * That requires LBR_FAR but that means far
656 * jmp need to be filtered out
657 */
380 pr_cont("16-deep LBR, "); 658 pr_cont("16-deep LBR, ");
381} 659}
382 660
@@ -391,6 +669,12 @@ void intel_pmu_lbr_init_snb(void)
391 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 669 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
392 x86_pmu.lbr_sel_map = snb_lbr_sel_map; 670 x86_pmu.lbr_sel_map = snb_lbr_sel_map;
393 671
672 /*
673 * SW branch filter usage:
674 * - support syscall, sysret capture.
675 * That requires LBR_FAR but that means far
676 * jmp need to be filtered out
677 */
394 pr_cont("16-deep LBR, "); 678 pr_cont("16-deep LBR, ");
395} 679}
396 680
@@ -412,5 +696,9 @@ void intel_pmu_lbr_init_atom(void)
412 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 696 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
413 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 697 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
414 698
699 /*
700 * SW branch filter usage:
701 * - compensate for lack of HW filter
702 */
415 pr_cont("8-deep LBR, "); 703 pr_cont("8-deep LBR, ");
416} 704}