author     Adrian Hunter <adrian.hunter@intel.com>       2019-01-09 04:18:33 -0500
committer  Arnaldo Carvalho de Melo <acme@redhat.com>    2019-02-06 08:00:40 -0500
commit     f08046cb3082b313e7b08dc35838cf8bd902c36b (patch)
tree       01ab2af620c66c6f595e14b235299679cabf24e0 /tools/perf/util/thread-stack.c
parent     90c2cda7056e3a7555d874a27aae12fd46ca802e (diff)
perf thread-stack: Represent jmps to the start of a different symbol
The compiler might optimize a call/ret combination by making it a jmp.
However the thread-stack does not presently cater for that, so that such
control flow is not visible in the call graph. Make it visible by
recording on the stack a branch to the start of a different symbol.
Note, that means when a ret pops the stack, all jmps must be popped off
first.

Example:

  $ cat jmp-to-fn.c
  __attribute__((noinline)) int bar(void)
  {
          return -1;
  }

  __attribute__((noinline)) int foo(void)
  {
          return bar() + 1;
  }

  int main()
  {
          return foo();
  }

  $ gcc -ggdb3 -Wall -Wextra -O2 -o jmp-to-fn jmp-to-fn.c

  $ objdump -d jmp-to-fn
  <SNIP>
  0000000000001040 <main>:
      1040:       31 c0                   xor    %eax,%eax
      1042:       e9 09 01 00 00          jmpq   1150 <foo>
  <SNIP>
  0000000000001140 <bar>:
      1140:       b8 ff ff ff ff          mov    $0xffffffff,%eax
      1145:       c3                      retq
  <SNIP>
  0000000000001150 <foo>:
      1150:       31 c0                   xor    %eax,%eax
      1152:       e8 e9 ff ff ff          callq  1140 <bar>
      1157:       83 c0 01                add    $0x1,%eax
      115a:       c3                      retq
  <SNIP>

  $ perf record -o jmp-to-fn.perf.data -e intel_pt/cyc/u ./jmp-to-fn
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0,017 MB jmp-to-fn.perf.data ]

  $ perf script -i jmp-to-fn.perf.data --itrace=be -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py jmp-to-fn.db branches calls
  2019-01-08 13:24:58.783069 Creating database...
  2019-01-08 13:24:58.794650 Writing records...
  2019-01-08 13:24:59.008050 Adding indexes
  2019-01-08 13:24:59.015802 Done

  $ ~/libexec/perf-core/scripts/python/exported-sql-viewer.py jmp-to-fn.db

Before:

    main -> bar

After:

    main -> foo -> bar

Committer testing:

Install the python2-pyside package, then select these menu options on
the GUI:

   "Reports"
      "Context sensitive callgraphs"

Then go on expanding the symbols to get the full picture, doing this on
a fedora:29 system with gcc version 8.2.1 20181215 (Red Hat 8.2.1-6)
(GCC):

  jmp-to-fn
      PID:TID
          _start                 (ld-2.28.so)
              __libc_start_main
                  main
                      foo
                          bar

To verify that, indeed, this fixes the problem.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20190109091835.5570-5-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
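To see the stack discipline described above in isolation, here is a
minimal standalone C sketch (a toy model, not perf code; all names,
addresses, and the fixed-size stack are illustrative) of the rule: a
jmp to the start of a different symbol is pushed like a call, and a
later ret must pop any such jmp entries before it pops its matching
call:

  #include <stdbool.h>
  #include <stdio.h>

  /* Illustrative stand-in for struct thread_stack_entry. */
  struct entry {
          unsigned long ret_addr;
          bool non_call;  /* a jmp to the start of a different symbol */
  };

  static struct entry stack[64];
  static int cnt;

  static void push(unsigned long ret_addr, bool non_call)
  {
          stack[cnt].ret_addr = ret_addr;
          stack[cnt].non_call = non_call;
          cnt++;
  }

  /* A ret first flushes jmp entries, then pops its matching call. */
  static void pop_ret(unsigned long ret_addr)
  {
          while (cnt && (stack[cnt - 1].non_call ||
                         stack[cnt - 1].ret_addr != ret_addr))
                  cnt--;
          if (cnt)
                  cnt--;
  }

  int main(void)
  {
          push(0xbeef, false);  /* __libc_start_main calls main       */
          push(0, true);        /* main jmps to foo: non_call entry   */
          push(0x1157, false);  /* foo calls bar                      */
          pop_ret(0x1157);      /* bar's ret pops just the call       */
          pop_ret(0xbeef);      /* foo's ret pops the jmp, then call  */
          printf("depth after both rets: %d\n", cnt);  /* prints 0 */
          return 0;
  }

In the real implementation this flush happens in thread_stack__pop_cp(),
which now skips non_call entries when matching ret_addr, as the hunk at
line 536 in the diff below shows.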
Diffstat (limited to 'tools/perf/util/thread-stack.c')
-rw-r--r--  tools/perf/util/thread-stack.c  30
1 files changed, 28 insertions, 2 deletions
diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
index 7f8eff018c16..f52c0f90915d 100644
--- a/tools/perf/util/thread-stack.c
+++ b/tools/perf/util/thread-stack.c
@@ -38,6 +38,7 @@
  * @cp: call path
  * @no_call: a 'call' was not seen
  * @trace_end: a 'call' but trace ended
+ * @non_call: a branch but not a 'call' to the start of a different symbol
  */
 struct thread_stack_entry {
         u64 ret_addr;
@@ -47,6 +48,7 @@ struct thread_stack_entry {
         struct call_path *cp;
         bool no_call;
         bool trace_end;
+        bool non_call;
 };
 
 /**
@@ -268,6 +270,8 @@ static int thread_stack__call_return(struct thread *thread,
                 cr.flags |= CALL_RETURN_NO_CALL;
         if (no_return)
                 cr.flags |= CALL_RETURN_NO_RETURN;
+        if (tse->non_call)
+                cr.flags |= CALL_RETURN_NON_CALL;
 
         return crp->process(&cr, crp->data);
 }
@@ -510,6 +514,7 @@ static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
         tse->cp = cp;
         tse->no_call = no_call;
         tse->trace_end = trace_end;
+        tse->non_call = false;
 
         return 0;
 }
@@ -531,14 +536,16 @@ static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
                                                 timestamp, ref, false);
         }
 
-        if (ts->stack[ts->cnt - 1].ret_addr == ret_addr) {
+        if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
+            !ts->stack[ts->cnt - 1].non_call) {
                 return thread_stack__call_return(thread, ts, --ts->cnt,
                                                  timestamp, ref, false);
         } else {
                 size_t i = ts->cnt - 1;
 
                 while (i--) {
-                        if (ts->stack[i].ret_addr != ret_addr)
+                        if (ts->stack[i].ret_addr != ret_addr ||
+                            ts->stack[i].non_call)
                                 continue;
                         i += 1;
                         while (ts->cnt > i) {
@@ -757,6 +764,25 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
                 err = thread_stack__trace_begin(thread, ts, sample->time, ref);
         } else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
                 err = thread_stack__trace_end(ts, sample, ref);
+        } else if (sample->flags & PERF_IP_FLAG_BRANCH &&
+                   from_al->sym != to_al->sym && to_al->sym &&
+                   to_al->addr == to_al->sym->start) {
+                struct call_path_root *cpr = ts->crp->cpr;
+                struct call_path *cp;
+
+                /*
+                 * The compiler might optimize a call/ret combination by making
+                 * it a jmp. Make that visible by recording on the stack a
+                 * branch to the start of a different symbol. Note, that means
+                 * when a ret pops the stack, all jmps must be popped off first.
+                 */
+                cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
+                                        to_al->sym, sample->addr,
+                                        ts->kernel_start);
+                err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
+                                            false);
+                if (!err)
+                        ts->stack[ts->cnt - 1].non_call = true;
         }
 
         return err;
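Downstream of this file, the CALL_RETURN_NON_CALL flag (added to
thread-stack.h in the same change) arrives in the call_return handed to
crp->process() in the hunk above. A hedged sketch of what a consumer
callback could look like, shaped after the two-argument
crp->process(&cr, crp->data) call in the diff; process_call_return and
struct my_stats are illustrative, and only the flags field is taken
from this patch:

  #include "thread-stack.h"  /* struct call_return, CALL_RETURN_* flags */

  struct my_stats {          /* hypothetical accumulator */
          unsigned long call_edges;
          unsigned long jmp_edges;
  };

  static int process_call_return(struct call_return *cr, void *data)
  {
          struct my_stats *stats = data;

          if (cr->flags & CALL_RETURN_NON_CALL)
                  stats->jmp_edges++;   /* edge from an optimized jmp */
          else
                  stats->call_edges++;

          return 0;
  }

A callback like this would be registered with
call_return_processor__new(process_call_return, &stats), assuming that
registration helper keeps the signature it has at this commit; the
database export behind the exported-sql-viewer example above obtains
its main -> foo -> bar edges through the same callback mechanism.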