perf tools: Construct LBR call chain

LBR call stack only has user-space callchains. It is output in the PERF_SAMPLE_BRANCH_STACK data format. For kernel callchains, it's still in the form of PERF_SAMPLE_CALLCHAIN. The perf tool has to handle both data sources to construct a complete callstack. For the "perf report -D" option, both lbr and fp information will be displayed. A new call chain recording option "lbr" is introduced into the perf tool for LBR call stack. The user can use --call-graph lbr to get the call stack information from hardware. Here are some examples. When profiling bc(1) on Fedora 19: echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph lbr bc -l < cmd If enabling LBR, perf report output looks like: 50.36% bc bc [.] bc_divide | --- bc_divide execute run_code yyparse main __libc_start_main _start 33.66% bc bc [.] _one_mult | --- _one_mult bc_divide execute run_code yyparse main __libc_start_main _start 7.62% bc bc [.] _bc_do_add | --- _bc_do_add | |--99.89%-- 0x2000186a8 --0.11%-- [...] 6.83% bc bc [.] _bc_do_sub | --- _bc_do_sub | |--99.94%-- bc_add | execute | run_code | yyparse | main | __libc_start_main | _start --0.06%-- [...] 0.46% bc libc-2.17.so [.] __memset_sse2 | --- __memset_sse2 | |--54.13%-- bc_new_num | | | |--51.00%-- bc_divide | | execute | | run_code | | yyparse | | main | | __libc_start_main | | _start | | | |--30.46%-- _bc_do_sub | | bc_add | | execute | | run_code | | yyparse | | main | | __libc_start_main | | _start | | | --18.55%-- _bc_do_add | bc_add | execute | run_code | yyparse | main | __libc_start_main | _start | --45.87%-- bc_divide execute run_code yyparse main __libc_start_main _start If using FP, perf report output looks like: echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph fp bc -l < cmd 50.49% bc bc [.] bc_divide | --- bc_divide 33.57% bc bc [.] _one_mult | --- _one_mult 7.61% bc bc [.] _bc_do_add | --- _bc_do_add 0x2000186a8 6.88% bc bc [.] _bc_do_sub | --- _bc_do_sub 0.42% bc libc-2.17.so [.] 
__memcpy_ssse3_back | --- __memcpy_ssse3_back If using LBR, perf report -D output looks like: 3458145275743 0x2fd750 [0xd8]: PERF_RECORD_SAMPLE(IP, 0x2): 9748/9748: 0x408ea8 period: 609644 addr: 0 ... LBR call chain: nr:8 ..... 0: fffffffffffffe00 ..... 1: 0000000000408e50 ..... 2: 000000000040a458 ..... 3: 000000000040562e ..... 4: 0000000000408590 ..... 5: 00000000004022c0 ..... 6: 00000000004015dd ..... 7: 0000003d1cc21b43 ... FP chain: nr:2 ..... 0: fffffffffffffe00 ..... 1: 0000000000408ea8 ... thread: bc:9748 ...... dso: /usr/bin/bc The LBR call stack has the following known limitations: - Zero length calls are not filtered out by the hardware - Exception handling such as setjmp/longjmp will cause calls/returns not to match - Pushing a different return address onto the stack will cause calls/returns not to match - If callstack is deeper than the LBR, only the last entries are captured Tested-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Kan Liang <kan.liang@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Borislav Petkov <bp@suse.de> Cc: David Ahern <dsahern@gmail.com> Cc: Don Zickus <dzickus@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Simon Que <sque@chromium.org> Cc: Stephane Eranian <eranian@google.com> Link: http://lkml.kernel.org/r/1420482185-29830-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Kan Liang <kan.liang@intel.com> 2015-01-05 13:23:05 -0500
committer: Ingo Molnar <mingo@kernel.org> 2015-02-18 11:16:18 -0500
commit: 384b60557b5522fcb99646f0eb6e7a344cdb94c6 (patch)
tree: 20da511149037cf1066c4f76ff9d84963bec12d0 /tools/perf/util/session.c
parent: aad2b21c151273fa7abc419dac51a980eff1dd17 (diff)
1 files changed, 58 insertions, 6 deletions
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0baf75f12b7c..504b7e664e6c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -553,15 +553,67 @@ int perf_session_queue_event(struct perf_session *s, union perf_event *event,
        return 0;
 }
-static void callchain__printf(struct perf_sample *sample)
+static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 {
+        struct ip_callchain *callchain = sample->callchain;
+        struct branch_stack *lbr_stack = sample->branch_stack;
+        u64 kernel_callchain_nr = callchain->nr;
        unsigned int i;
-        printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr);
+        for (i = 0; i < kernel_callchain_nr; i++) {
+                if (callchain->ips[i] == PERF_CONTEXT_USER)
+                        break;
+        }
+        if ((i != kernel_callchain_nr) && lbr_stack->nr) {
+                u64 total_nr;
+                /*
+                 * LBR callstack can only get user call chain,
+                 * i is kernel call chain number,
+                 * 1 is PERF_CONTEXT_USER.
+                 *
+                 * The user call chain is stored in LBR registers.
+                 * LBR are pair registers. The caller is stored
+                 * in "from" register, while the callee is stored
+                 * in "to" register.
+                 * For example, there is a call stack
+                 * "A"->"B"->"C"->"D".
+                 * The LBR registers will record like
+                 * "C"->"D", "B"->"C", "A"->"B".
+                 * So only the first "to" register and all "from"
+                 * registers are needed to construct the whole stack.
+                 */
+                total_nr = i + 1 + lbr_stack->nr + 1;
+                kernel_callchain_nr = i + 1;
+                printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr);
+                for (i = 0; i < kernel_callchain_nr; i++)
+                        printf("..... %2d: %016" PRIx64 "\n",
+                               i, callchain->ips[i]);
+                printf("..... %2d: %016" PRIx64 "\n",
+                       (int)(kernel_callchain_nr), lbr_stack->entries[0].to);
+                for (i = 0; i < lbr_stack->nr; i++)
+                        printf("..... %2d: %016" PRIx64 "\n",
+                               (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
+        }
+}
+static void callchain__printf(struct perf_evsel *evsel,
+                              struct perf_sample *sample)
+{
+        unsigned int i;
+        struct ip_callchain *callchain = sample->callchain;
+        if (has_branch_callstack(evsel))
+                callchain__lbr_callstack_printf(sample);
+        printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
-        for (i = 0; i < sample->callchain->nr; i++)
+        for (i = 0; i < callchain->nr; i++)
                printf("..... %2d: %016" PRIx64 "\n",
-                       i, sample->callchain->ips[i]);
+                       i, callchain->ips[i]);
 }
 static void branch_stack__printf(struct perf_sample *sample)
@@ -718,9 +770,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
        sample_type = evsel->attr.sample_type;
        if (sample_type & PERF_SAMPLE_CALLCHAIN)
-                callchain__printf(sample);
+                callchain__printf(evsel, sample);
-        if (sample_type & PERF_SAMPLE_BRANCH_STACK)
+        if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
                branch_stack__printf(sample);
        if (sample_type & PERF_SAMPLE_REGS_USER)
author	Kan Liang <kan.liang@intel.com>	2015-01-05 13:23:05 -0500
committer	Ingo Molnar <mingo@kernel.org>	2015-02-18 11:16:18 -0500
commit	384b60557b5522fcb99646f0eb6e7a344cdb94c6 (patch)
tree	20da511149037cf1066c4f76ff9d84963bec12d0 /tools/perf/util/session.c
parent	aad2b21c151273fa7abc419dac51a980eff1dd17 (diff)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0baf75f12b7c..504b7e664e6c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c
@@ -553,15 +553,67 @@ int perf_session_queue_event(struct perf_session s, union perf_event event,
553	return 0;	553	return 0;
554	}	554	}
555		555
556	static void callchain__printf(struct perf_sample *sample)	556	static void callchain__lbr_callstack_printf(struct perf_sample *sample)
557	{	557	{
		558	struct ip_callchain *callchain = sample->callchain;
		559	struct branch_stack *lbr_stack = sample->branch_stack;
		560	u64 kernel_callchain_nr = callchain->nr;
558	unsigned int i;	561	unsigned int i;
559		562
560	printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr);	563	for (i = 0; i < kernel_callchain_nr; i++) {
		564	if (callchain->ips[i] == PERF_CONTEXT_USER)
		565	break;
		566	}
		567
		568	if ((i != kernel_callchain_nr) && lbr_stack->nr) {
		569	u64 total_nr;
		570	/*
		571	* LBR callstack can only get user call chain,
		572	* i is kernel call chain number,
		573	* 1 is PERF_CONTEXT_USER.
		574	*
		575	* The user call chain is stored in LBR registers.
		576	* LBR are pair registers. The caller is stored
		577	* in "from" register, while the callee is stored
		578	* in "to" register.
		579	* For example, there is a call stack
		580	* "A"->"B"->"C"->"D".
		583	* The LBR registers will record like
		582	* "C"->"D", "B"->"C", "A"->"B".
		583	* So only the first "to" register and all "from"
		584	* registers are needed to construct the whole stack.
		585	*/
		586	total_nr = i + 1 + lbr_stack->nr + 1;
		587	kernel_callchain_nr = i + 1;
		588
		589	printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr);
		590
		591	for (i = 0; i < kernel_callchain_nr; i++)
		592	printf("..... %2d: %016" PRIx64 "\n",
		593	i, callchain->ips[i]);
		594
		595	printf("..... %2d: %016" PRIx64 "\n",
		596	(int)(kernel_callchain_nr), lbr_stack->entries[0].to);
		597	for (i = 0; i < lbr_stack->nr; i++)
		598	printf("..... %2d: %016" PRIx64 "\n",
		599	(int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
		600	}
		601	}
		602
		603	static void callchain__printf(struct perf_evsel *evsel,
		604	struct perf_sample *sample)
		605	{
		606	unsigned int i;
		607	struct ip_callchain *callchain = sample->callchain;
		608
		609	if (has_branch_callstack(evsel))
		610	callchain__lbr_callstack_printf(sample);
		611
		612	printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
561		613
562	for (i = 0; i < sample->callchain->nr; i++)	614	for (i = 0; i < callchain->nr; i++)
563	printf("..... %2d: %016" PRIx64 "\n",	615	printf("..... %2d: %016" PRIx64 "\n",
564	i, sample->callchain->ips[i]);	616	i, callchain->ips[i]);
565	}	617	}
566		618
567	static void branch_stack__printf(struct perf_sample *sample)	619	static void branch_stack__printf(struct perf_sample *sample)
@@ -718,9 +770,9 @@ static void dump_sample(struct perf_evsel evsel, union perf_event event,
718	sample_type = evsel->attr.sample_type;	770	sample_type = evsel->attr.sample_type;
719		771
720	if (sample_type & PERF_SAMPLE_CALLCHAIN)	772	if (sample_type & PERF_SAMPLE_CALLCHAIN)
721	callchain__printf(sample);	773	callchain__printf(evsel, sample);
722		774
723	if (sample_type & PERF_SAMPLE_BRANCH_STACK)	775	if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
724	branch_stack__printf(sample);	776	branch_stack__printf(sample);
725		777
726	if (sample_type & PERF_SAMPLE_REGS_USER)	778	if (sample_type & PERF_SAMPLE_REGS_USER)