aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf
diff options
context:
space:
mode:
authorKan Liang <kan.liang@intel.com>2015-01-05 13:23:05 -0500
committerIngo Molnar <mingo@kernel.org>2015-02-18 11:16:18 -0500
commit384b60557b5522fcb99646f0eb6e7a344cdb94c6 (patch)
tree20da511149037cf1066c4f76ff9d84963bec12d0 /tools/perf
parentaad2b21c151273fa7abc419dac51a980eff1dd17 (diff)
perf tools: Construct LBR call chain
LBR call stack only has user-space callchains. It is output in the PERF_SAMPLE_BRANCH_STACK data format. For kernel callchains, it's still in the form of PERF_SAMPLE_CALLCHAIN. The perf tool has to handle both data sources to construct a complete callstack. For the "perf report -D" option, both lbr and fp information will be displayed. A new call chain recording option "lbr" is introduced into the perf tool for LBR call stack. The user can use --call-graph lbr to get the call stack information from hardware. Here are some examples. When profiling bc(1) on Fedora 19: echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph lbr bc -l < cmd If enabling LBR, perf report output looks like: 50.36% bc bc [.] bc_divide | --- bc_divide execute run_code yyparse main __libc_start_main _start 33.66% bc bc [.] _one_mult | --- _one_mult bc_divide execute run_code yyparse main __libc_start_main _start 7.62% bc bc [.] _bc_do_add | --- _bc_do_add | |--99.89%-- 0x2000186a8 --0.11%-- [...] 6.83% bc bc [.] _bc_do_sub | --- _bc_do_sub | |--99.94%-- bc_add | execute | run_code | yyparse | main | __libc_start_main | _start --0.06%-- [...] 0.46% bc libc-2.17.so [.] __memset_sse2 | --- __memset_sse2 | |--54.13%-- bc_new_num | | | |--51.00%-- bc_divide | | execute | | run_code | | yyparse | | main | | __libc_start_main | | _start | | | |--30.46%-- _bc_do_sub | | bc_add | | execute | | run_code | | yyparse | | main | | __libc_start_main | | _start | | | --18.55%-- _bc_do_add | bc_add | execute | run_code | yyparse | main | __libc_start_main | _start | --45.87%-- bc_divide execute run_code yyparse main __libc_start_main _start If using FP, perf report output looks like: echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph fp bc -l < cmd 50.49% bc bc [.] bc_divide | --- bc_divide 33.57% bc bc [.] _one_mult | --- _one_mult 7.61% bc bc [.] _bc_do_add | --- _bc_do_add 0x2000186a8 6.88% bc bc [.] _bc_do_sub | --- _bc_do_sub 0.42% bc libc-2.17.so [.] 
__memcpy_ssse3_back | --- __memcpy_ssse3_back If using LBR, perf report -D output looks like: 3458145275743 0x2fd750 [0xd8]: PERF_RECORD_SAMPLE(IP, 0x2): 9748/9748: 0x408ea8 period: 609644 addr: 0 ... LBR call chain: nr:8 ..... 0: fffffffffffffe00 ..... 1: 0000000000408e50 ..... 2: 000000000040a458 ..... 3: 000000000040562e ..... 4: 0000000000408590 ..... 5: 00000000004022c0 ..... 6: 00000000004015dd ..... 7: 0000003d1cc21b43 ... FP chain: nr:2 ..... 0: fffffffffffffe00 ..... 1: 0000000000408ea8 ... thread: bc:9748 ...... dso: /usr/bin/bc The LBR call stack has the following known limitations: - Zero length calls are not filtered out by the hardware - Exception handling such as setjmp/longjmp will have calls/returns not match - Pushing a different return address onto the stack will have calls/returns not match - If callstack is deeper than the LBR, only the last entries are captured Tested-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Kan Liang <kan.liang@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Borislav Petkov <bp@suse.de> Cc: David Ahern <dsahern@gmail.com> Cc: Don Zickus <dzickus@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Simon Que <sque@chromium.org> Cc: Stephane Eranian <eranian@google.com> Link: http://lkml.kernel.org/r/1420482185-29830-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'tools/perf')
-rw-r--r--tools/perf/util/evsel.h4
-rw-r--r--tools/perf/util/machine.c102
-rw-r--r--tools/perf/util/session.c64
3 files changed, 153 insertions, 17 deletions
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 38622747d130..dcf202aebe9f 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -355,4 +355,8 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \
355 (_evsel) && (_evsel)->leader == (_leader); \ 355 (_evsel) && (_evsel)->leader == (_leader); \
356 (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node)) 356 (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node))
357 357
358static inline bool has_branch_callstack(struct perf_evsel *evsel)
359{
360 return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
361}
358#endif /* __PERF_EVSEL_H */ 362#endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 1bca3a9f2b16..9e0f60a7e7b3 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1502,18 +1502,100 @@ static int remove_loops(struct branch_entry *l, int nr)
1502 return nr; 1502 return nr;
1503} 1503}
1504 1504
1505static int thread__resolve_callchain_sample(struct thread *thread, 1505/*
1506 struct ip_callchain *chain, 1506 * Resolve LBR callstack chain sample
1507 struct branch_stack *branch, 1507 * Return:
1508 struct symbol **parent, 1508 * 1 on success get LBR callchain information
1509 struct addr_location *root_al, 1509 * 0 no available LBR callchain information, should try fp
1510 int max_stack) 1510 * negative error code on other errors.
1511 */
1512static int resolve_lbr_callchain_sample(struct thread *thread,
1513 struct perf_sample *sample,
1514 struct symbol **parent,
1515 struct addr_location *root_al,
1516 int max_stack)
1511{ 1517{
1518 struct ip_callchain *chain = sample->callchain;
1519 int chain_nr = min(max_stack, (int)chain->nr);
1520 int i, j, err;
1521 u64 ip;
1522
1523 for (i = 0; i < chain_nr; i++) {
1524 if (chain->ips[i] == PERF_CONTEXT_USER)
1525 break;
1526 }
1527
1528 /* LBR only affects the user callchain */
1529 if (i != chain_nr) {
1530 struct branch_stack *lbr_stack = sample->branch_stack;
1531 int lbr_nr = lbr_stack->nr;
1532 /*
1533 * LBR callstack can only get user call chain.
1534 * The mix_chain_nr is kernel call chain
1535 * number plus LBR user call chain number.
1536 * i is kernel call chain number,
1537 * 1 is PERF_CONTEXT_USER,
1538 * lbr_nr + 1 is the user call chain number.
1539 * For details, please refer to the comments
1540 * in callchain__printf
1541 */
1542 int mix_chain_nr = i + 1 + lbr_nr + 1;
1543
1544 if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
1545 pr_warning("corrupted callchain. skipping...\n");
1546 return 0;
1547 }
1548
1549 for (j = 0; j < mix_chain_nr; j++) {
1550 if (callchain_param.order == ORDER_CALLEE) {
1551 if (j < i + 1)
1552 ip = chain->ips[j];
1553 else if (j > i + 1)
1554 ip = lbr_stack->entries[j - i - 2].from;
1555 else
1556 ip = lbr_stack->entries[0].to;
1557 } else {
1558 if (j < lbr_nr)
1559 ip = lbr_stack->entries[lbr_nr - j - 1].from;
1560 else if (j > lbr_nr)
1561 ip = chain->ips[i + 1 - (j - lbr_nr)];
1562 else
1563 ip = lbr_stack->entries[0].to;
1564 }
1565
1566 err = add_callchain_ip(thread, parent, root_al, false, ip);
1567 if (err)
1568 return (err < 0) ? err : 0;
1569 }
1570 return 1;
1571 }
1572
1573 return 0;
1574}
1575
1576static int thread__resolve_callchain_sample(struct thread *thread,
1577 struct perf_evsel *evsel,
1578 struct perf_sample *sample,
1579 struct symbol **parent,
1580 struct addr_location *root_al,
1581 int max_stack)
1582{
1583 struct branch_stack *branch = sample->branch_stack;
1584 struct ip_callchain *chain = sample->callchain;
1512 int chain_nr = min(max_stack, (int)chain->nr); 1585 int chain_nr = min(max_stack, (int)chain->nr);
1513 int i, j, err; 1586 int i, j, err;
1514 int skip_idx = -1; 1587 int skip_idx = -1;
1515 int first_call = 0; 1588 int first_call = 0;
1516 1589
1590 callchain_cursor_reset(&callchain_cursor);
1591
1592 if (has_branch_callstack(evsel)) {
1593 err = resolve_lbr_callchain_sample(thread, sample, parent,
1594 root_al, max_stack);
1595 if (err)
1596 return (err < 0) ? err : 0;
1597 }
1598
1517 /* 1599 /*
1518 * Based on DWARF debug information, some architectures skip 1600 * Based on DWARF debug information, some architectures skip
1519 * a callchain entry saved by the kernel. 1601 * a callchain entry saved by the kernel.
@@ -1521,8 +1603,6 @@ static int thread__resolve_callchain_sample(struct thread *thread,
1521 if (chain->nr < PERF_MAX_STACK_DEPTH) 1603 if (chain->nr < PERF_MAX_STACK_DEPTH)
1522 skip_idx = arch_skip_callchain_idx(thread, chain); 1604 skip_idx = arch_skip_callchain_idx(thread, chain);
1523 1605
1524 callchain_cursor_reset(&callchain_cursor);
1525
1526 /* 1606 /*
1527 * Add branches to call stack for easier browsing. This gives 1607 * Add branches to call stack for easier browsing. This gives
1528 * more context for a sample than just the callers. 1608 * more context for a sample than just the callers.
@@ -1623,9 +1703,9 @@ int thread__resolve_callchain(struct thread *thread,
1623 struct addr_location *root_al, 1703 struct addr_location *root_al,
1624 int max_stack) 1704 int max_stack)
1625{ 1705{
1626 int ret = thread__resolve_callchain_sample(thread, sample->callchain, 1706 int ret = thread__resolve_callchain_sample(thread, evsel,
1627 sample->branch_stack, 1707 sample, parent,
1628 parent, root_al, max_stack); 1708 root_al, max_stack);
1629 if (ret) 1709 if (ret)
1630 return ret; 1710 return ret;
1631 1711
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0baf75f12b7c..504b7e664e6c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -553,15 +553,67 @@ int perf_session_queue_event(struct perf_session *s, union perf_event *event,
553 return 0; 553 return 0;
554} 554}
555 555
556static void callchain__printf(struct perf_sample *sample) 556static void callchain__lbr_callstack_printf(struct perf_sample *sample)
557{ 557{
558 struct ip_callchain *callchain = sample->callchain;
559 struct branch_stack *lbr_stack = sample->branch_stack;
560 u64 kernel_callchain_nr = callchain->nr;
558 unsigned int i; 561 unsigned int i;
559 562
560 printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr); 563 for (i = 0; i < kernel_callchain_nr; i++) {
564 if (callchain->ips[i] == PERF_CONTEXT_USER)
565 break;
566 }
567
568 if ((i != kernel_callchain_nr) && lbr_stack->nr) {
569 u64 total_nr;
570 /*
571 * LBR callstack can only get user call chain,
572 * i is kernel call chain number,
573 * 1 is PERF_CONTEXT_USER.
574 *
575 * The user call chain is stored in LBR registers.
576 * LBR are pair registers. The caller is stored
577 * in "from" register, while the callee is stored
578 * in "to" register.
579 * For example, there is a call stack
580 * "A"->"B"->"C"->"D".
581 * The LBR registers will record like
582 * "C"->"D", "B"->"C", "A"->"B".
583 * So only the first "to" register and all "from"
584 * registers are needed to construct the whole stack.
585 */
586 total_nr = i + 1 + lbr_stack->nr + 1;
587 kernel_callchain_nr = i + 1;
588
589 printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr);
590
591 for (i = 0; i < kernel_callchain_nr; i++)
592 printf("..... %2d: %016" PRIx64 "\n",
593 i, callchain->ips[i]);
594
595 printf("..... %2d: %016" PRIx64 "\n",
596 (int)(kernel_callchain_nr), lbr_stack->entries[0].to);
597 for (i = 0; i < lbr_stack->nr; i++)
598 printf("..... %2d: %016" PRIx64 "\n",
599 (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
600 }
601}
602
603static void callchain__printf(struct perf_evsel *evsel,
604 struct perf_sample *sample)
605{
606 unsigned int i;
607 struct ip_callchain *callchain = sample->callchain;
608
609 if (has_branch_callstack(evsel))
610 callchain__lbr_callstack_printf(sample);
611
612 printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
561 613
562 for (i = 0; i < sample->callchain->nr; i++) 614 for (i = 0; i < callchain->nr; i++)
563 printf("..... %2d: %016" PRIx64 "\n", 615 printf("..... %2d: %016" PRIx64 "\n",
564 i, sample->callchain->ips[i]); 616 i, callchain->ips[i]);
565} 617}
566 618
567static void branch_stack__printf(struct perf_sample *sample) 619static void branch_stack__printf(struct perf_sample *sample)
@@ -718,9 +770,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
718 sample_type = evsel->attr.sample_type; 770 sample_type = evsel->attr.sample_type;
719 771
720 if (sample_type & PERF_SAMPLE_CALLCHAIN) 772 if (sample_type & PERF_SAMPLE_CALLCHAIN)
721 callchain__printf(sample); 773 callchain__printf(evsel, sample);
722 774
723 if (sample_type & PERF_SAMPLE_BRANCH_STACK) 775 if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
724 branch_stack__printf(sample); 776 branch_stack__printf(sample);
725 777
726 if (sample_type & PERF_SAMPLE_REGS_USER) 778 if (sample_type & PERF_SAMPLE_REGS_USER)