diff options
author | Kan Liang <kan.liang@intel.com> | 2015-01-05 13:23:05 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2015-02-18 11:16:18 -0500 |
commit | 384b60557b5522fcb99646f0eb6e7a344cdb94c6 (patch) | |
tree | 20da511149037cf1066c4f76ff9d84963bec12d0 /tools/perf | |
parent | aad2b21c151273fa7abc419dac51a980eff1dd17 (diff) |
perf tools: Construct LBR call chain
LBR call stack only has user-space callchains. It is output in the
PERF_SAMPLE_BRANCH_STACK data format. For kernel callchains, it's
still in the form of PERF_SAMPLE_CALLCHAIN.
The perf tool has to handle both data sources to construct a
complete callstack.
For the "perf report -D" option, both lbr and fp information will be
displayed.
A new call chain recording option "lbr" is introduced into the perf
tool for LBR call stack. The user can use --call-graph lbr to get
the call stack information from hardware.
Here are some examples.
When profiling bc(1) on Fedora 19:
echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph lbr bc -l < cmd
If enabling LBR, perf report output looks like:
50.36% bc bc [.] bc_divide
|
--- bc_divide
execute
run_code
yyparse
main
__libc_start_main
_start
33.66% bc bc [.] _one_mult
|
--- _one_mult
bc_divide
execute
run_code
yyparse
main
__libc_start_main
_start
7.62% bc bc [.] _bc_do_add
|
--- _bc_do_add
|
|--99.89%-- 0x2000186a8
--0.11%-- [...]
6.83% bc bc [.] _bc_do_sub
|
--- _bc_do_sub
|
|--99.94%-- bc_add
| execute
| run_code
| yyparse
| main
| __libc_start_main
| _start
--0.06%-- [...]
0.46% bc libc-2.17.so [.] __memset_sse2
|
--- __memset_sse2
|
|--54.13%-- bc_new_num
| |
| |--51.00%-- bc_divide
| | execute
| | run_code
| | yyparse
| | main
| | __libc_start_main
| | _start
| |
| |--30.46%-- _bc_do_sub
| | bc_add
| | execute
| | run_code
| | yyparse
| | main
| | __libc_start_main
| | _start
| |
| --18.55%-- _bc_do_add
| bc_add
| execute
| run_code
| yyparse
| main
| __libc_start_main
| _start
|
--45.87%-- bc_divide
execute
run_code
yyparse
main
__libc_start_main
_start
If using FP, perf report output looks like:
echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph fp bc -l < cmd
50.49% bc bc [.] bc_divide
|
--- bc_divide
33.57% bc bc [.] _one_mult
|
--- _one_mult
7.61% bc bc [.] _bc_do_add
|
--- _bc_do_add
0x2000186a8
6.88% bc bc [.] _bc_do_sub
|
--- _bc_do_sub
0.42% bc libc-2.17.so [.] __memcpy_ssse3_back
|
--- __memcpy_ssse3_back
If using LBR, perf report -D output looks like:
3458145275743 0x2fd750 [0xd8]: PERF_RECORD_SAMPLE(IP, 0x2): 9748/9748: 0x408ea8 period: 609644 addr: 0
... LBR call chain: nr:8
..... 0: fffffffffffffe00
..... 1: 0000000000408e50
..... 2: 000000000040a458
..... 3: 000000000040562e
..... 4: 0000000000408590
..... 5: 00000000004022c0
..... 6: 00000000004015dd
..... 7: 0000003d1cc21b43
... FP chain: nr:2
..... 0: fffffffffffffe00
..... 1: 0000000000408ea8
... thread: bc:9748
...... dso: /usr/bin/bc
The LBR call stack has the following known limitations:
- Zero length calls are not filtered out by the hardware
- Exception handling such as setjmp/longjmp will have calls/returns not
match
- Pushing different return address onto the stack will have
calls/returns not match
- If callstack is deeper than the LBR, only the last entries are
captured
Tested-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: David Ahern <dsahern@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Simon Que <sque@chromium.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1420482185-29830-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'tools/perf')
-rw-r--r-- | tools/perf/util/evsel.h | 4 | ||||
-rw-r--r-- | tools/perf/util/machine.c | 102 | ||||
-rw-r--r-- | tools/perf/util/session.c | 64 |
3 files changed, 153 insertions, 17 deletions
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 38622747d130..dcf202aebe9f 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h | |||
@@ -355,4 +355,8 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \ | |||
355 | (_evsel) && (_evsel)->leader == (_leader); \ | 355 | (_evsel) && (_evsel)->leader == (_leader); \ |
356 | (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node)) | 356 | (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node)) |
357 | 357 | ||
358 | static inline bool has_branch_callstack(struct perf_evsel *evsel) | ||
359 | { | ||
360 | return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; | ||
361 | } | ||
358 | #endif /* __PERF_EVSEL_H */ | 362 | #endif /* __PERF_EVSEL_H */ |
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 1bca3a9f2b16..9e0f60a7e7b3 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c | |||
@@ -1502,18 +1502,100 @@ static int remove_loops(struct branch_entry *l, int nr) | |||
1502 | return nr; | 1502 | return nr; |
1503 | } | 1503 | } |
1504 | 1504 | ||
1505 | static int thread__resolve_callchain_sample(struct thread *thread, | 1505 | /* |
1506 | struct ip_callchain *chain, | 1506 | * Resolve LBR callstack chain sample |
1507 | struct branch_stack *branch, | 1507 | * Return: |
1508 | struct symbol **parent, | 1508 | * 1 on success get LBR callchain information |
1509 | struct addr_location *root_al, | 1509 | * 0 no available LBR callchain information, should try fp |
1510 | int max_stack) | 1510 | * negative error code on other errors. |
1511 | */ | ||
1512 | static int resolve_lbr_callchain_sample(struct thread *thread, | ||
1513 | struct perf_sample *sample, | ||
1514 | struct symbol **parent, | ||
1515 | struct addr_location *root_al, | ||
1516 | int max_stack) | ||
1511 | { | 1517 | { |
1518 | struct ip_callchain *chain = sample->callchain; | ||
1519 | int chain_nr = min(max_stack, (int)chain->nr); | ||
1520 | int i, j, err; | ||
1521 | u64 ip; | ||
1522 | |||
1523 | for (i = 0; i < chain_nr; i++) { | ||
1524 | if (chain->ips[i] == PERF_CONTEXT_USER) | ||
1525 | break; | ||
1526 | } | ||
1527 | |||
1528 | /* LBR only affects the user callchain */ | ||
1529 | if (i != chain_nr) { | ||
1530 | struct branch_stack *lbr_stack = sample->branch_stack; | ||
1531 | int lbr_nr = lbr_stack->nr; | ||
1532 | /* | ||
1533 | * LBR callstack can only get user call chain. | ||
1534 | * The mix_chain_nr is kernel call chain | ||
1535 | * number plus LBR user call chain number. | ||
1536 | * i is kernel call chain number, | ||
1537 | * 1 is PERF_CONTEXT_USER, | ||
1538 | * lbr_nr + 1 is the user call chain number. | ||
1539 | * For details, please refer to the comments | ||
1540 | * in callchain__printf | ||
1541 | */ | ||
1542 | int mix_chain_nr = i + 1 + lbr_nr + 1; | ||
1543 | |||
1544 | if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) { | ||
1545 | pr_warning("corrupted callchain. skipping...\n"); | ||
1546 | return 0; | ||
1547 | } | ||
1548 | |||
1549 | for (j = 0; j < mix_chain_nr; j++) { | ||
1550 | if (callchain_param.order == ORDER_CALLEE) { | ||
1551 | if (j < i + 1) | ||
1552 | ip = chain->ips[j]; | ||
1553 | else if (j > i + 1) | ||
1554 | ip = lbr_stack->entries[j - i - 2].from; | ||
1555 | else | ||
1556 | ip = lbr_stack->entries[0].to; | ||
1557 | } else { | ||
1558 | if (j < lbr_nr) | ||
1559 | ip = lbr_stack->entries[lbr_nr - j - 1].from; | ||
1560 | else if (j > lbr_nr) | ||
1561 | ip = chain->ips[i + 1 - (j - lbr_nr)]; | ||
1562 | else | ||
1563 | ip = lbr_stack->entries[0].to; | ||
1564 | } | ||
1565 | |||
1566 | err = add_callchain_ip(thread, parent, root_al, false, ip); | ||
1567 | if (err) | ||
1568 | return (err < 0) ? err : 0; | ||
1569 | } | ||
1570 | return 1; | ||
1571 | } | ||
1572 | |||
1573 | return 0; | ||
1574 | } | ||
1575 | |||
1576 | static int thread__resolve_callchain_sample(struct thread *thread, | ||
1577 | struct perf_evsel *evsel, | ||
1578 | struct perf_sample *sample, | ||
1579 | struct symbol **parent, | ||
1580 | struct addr_location *root_al, | ||
1581 | int max_stack) | ||
1582 | { | ||
1583 | struct branch_stack *branch = sample->branch_stack; | ||
1584 | struct ip_callchain *chain = sample->callchain; | ||
1512 | int chain_nr = min(max_stack, (int)chain->nr); | 1585 | int chain_nr = min(max_stack, (int)chain->nr); |
1513 | int i, j, err; | 1586 | int i, j, err; |
1514 | int skip_idx = -1; | 1587 | int skip_idx = -1; |
1515 | int first_call = 0; | 1588 | int first_call = 0; |
1516 | 1589 | ||
1590 | callchain_cursor_reset(&callchain_cursor); | ||
1591 | |||
1592 | if (has_branch_callstack(evsel)) { | ||
1593 | err = resolve_lbr_callchain_sample(thread, sample, parent, | ||
1594 | root_al, max_stack); | ||
1595 | if (err) | ||
1596 | return (err < 0) ? err : 0; | ||
1597 | } | ||
1598 | |||
1517 | /* | 1599 | /* |
1518 | * Based on DWARF debug information, some architectures skip | 1600 | * Based on DWARF debug information, some architectures skip |
1519 | * a callchain entry saved by the kernel. | 1601 | * a callchain entry saved by the kernel. |
@@ -1521,8 +1603,6 @@ static int thread__resolve_callchain_sample(struct thread *thread, | |||
1521 | if (chain->nr < PERF_MAX_STACK_DEPTH) | 1603 | if (chain->nr < PERF_MAX_STACK_DEPTH) |
1522 | skip_idx = arch_skip_callchain_idx(thread, chain); | 1604 | skip_idx = arch_skip_callchain_idx(thread, chain); |
1523 | 1605 | ||
1524 | callchain_cursor_reset(&callchain_cursor); | ||
1525 | |||
1526 | /* | 1606 | /* |
1527 | * Add branches to call stack for easier browsing. This gives | 1607 | * Add branches to call stack for easier browsing. This gives |
1528 | * more context for a sample than just the callers. | 1608 | * more context for a sample than just the callers. |
@@ -1623,9 +1703,9 @@ int thread__resolve_callchain(struct thread *thread, | |||
1623 | struct addr_location *root_al, | 1703 | struct addr_location *root_al, |
1624 | int max_stack) | 1704 | int max_stack) |
1625 | { | 1705 | { |
1626 | int ret = thread__resolve_callchain_sample(thread, sample->callchain, | 1706 | int ret = thread__resolve_callchain_sample(thread, evsel, |
1627 | sample->branch_stack, | 1707 | sample, parent, |
1628 | parent, root_al, max_stack); | 1708 | root_al, max_stack); |
1629 | if (ret) | 1709 | if (ret) |
1630 | return ret; | 1710 | return ret; |
1631 | 1711 | ||
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0baf75f12b7c..504b7e664e6c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c | |||
@@ -553,15 +553,67 @@ int perf_session_queue_event(struct perf_session *s, union perf_event *event, | |||
553 | return 0; | 553 | return 0; |
554 | } | 554 | } |
555 | 555 | ||
556 | static void callchain__printf(struct perf_sample *sample) | 556 | static void callchain__lbr_callstack_printf(struct perf_sample *sample) |
557 | { | 557 | { |
558 | struct ip_callchain *callchain = sample->callchain; | ||
559 | struct branch_stack *lbr_stack = sample->branch_stack; | ||
560 | u64 kernel_callchain_nr = callchain->nr; | ||
558 | unsigned int i; | 561 | unsigned int i; |
559 | 562 | ||
560 | printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr); | 563 | for (i = 0; i < kernel_callchain_nr; i++) { |
564 | if (callchain->ips[i] == PERF_CONTEXT_USER) | ||
565 | break; | ||
566 | } | ||
567 | |||
568 | if ((i != kernel_callchain_nr) && lbr_stack->nr) { | ||
569 | u64 total_nr; | ||
570 | /* | ||
571 | * LBR callstack can only get user call chain, | ||
572 | * i is kernel call chain number, | ||
573 | * 1 is PERF_CONTEXT_USER. | ||
574 | * | ||
575 | * The user call chain is stored in LBR registers. | ||
576 | * LBR are pair registers. The caller is stored | ||
577 | * in "from" register, while the callee is stored | ||
578 | * in "to" register. | ||
579 | * For example, there is a call stack | ||
580 | * "A"->"B"->"C"->"D". | ||
581 | * The LBR registers will record like | ||
582 | * "C"->"D", "B"->"C", "A"->"B". | ||
583 | * So only the first "to" register and all "from" | ||
584 | * registers are needed to construct the whole stack. | ||
585 | */ | ||
586 | total_nr = i + 1 + lbr_stack->nr + 1; | ||
587 | kernel_callchain_nr = i + 1; | ||
588 | |||
589 | printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr); | ||
590 | |||
591 | for (i = 0; i < kernel_callchain_nr; i++) | ||
592 | printf("..... %2d: %016" PRIx64 "\n", | ||
593 | i, callchain->ips[i]); | ||
594 | |||
595 | printf("..... %2d: %016" PRIx64 "\n", | ||
596 | (int)(kernel_callchain_nr), lbr_stack->entries[0].to); | ||
597 | for (i = 0; i < lbr_stack->nr; i++) | ||
598 | printf("..... %2d: %016" PRIx64 "\n", | ||
599 | (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from); | ||
600 | } | ||
601 | } | ||
602 | |||
603 | static void callchain__printf(struct perf_evsel *evsel, | ||
604 | struct perf_sample *sample) | ||
605 | { | ||
606 | unsigned int i; | ||
607 | struct ip_callchain *callchain = sample->callchain; | ||
608 | |||
609 | if (has_branch_callstack(evsel)) | ||
610 | callchain__lbr_callstack_printf(sample); | ||
611 | |||
612 | printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr); | ||
561 | 613 | ||
562 | for (i = 0; i < sample->callchain->nr; i++) | 614 | for (i = 0; i < callchain->nr; i++) |
563 | printf("..... %2d: %016" PRIx64 "\n", | 615 | printf("..... %2d: %016" PRIx64 "\n", |
564 | i, sample->callchain->ips[i]); | 616 | i, callchain->ips[i]); |
565 | } | 617 | } |
566 | 618 | ||
567 | static void branch_stack__printf(struct perf_sample *sample) | 619 | static void branch_stack__printf(struct perf_sample *sample) |
@@ -718,9 +770,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event, | |||
718 | sample_type = evsel->attr.sample_type; | 770 | sample_type = evsel->attr.sample_type; |
719 | 771 | ||
720 | if (sample_type & PERF_SAMPLE_CALLCHAIN) | 772 | if (sample_type & PERF_SAMPLE_CALLCHAIN) |
721 | callchain__printf(sample); | 773 | callchain__printf(evsel, sample); |
722 | 774 | ||
723 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) | 775 | if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel)) |
724 | branch_stack__printf(sample); | 776 | branch_stack__printf(sample); |
725 | 777 | ||
726 | if (sample_type & PERF_SAMPLE_REGS_USER) | 778 | if (sample_type & PERF_SAMPLE_REGS_USER) |