aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf
diff options
context:
space:
mode:
author Roberto Agostino Vitillo <ravitillo@lbl.gov> 2012-02-09 17:21:03 -0500
committer Ingo Molnar <mingo@elte.hu> 2012-03-09 02:26:05 -0500
commit b50311dc2ac1c04ad19163c2359910b25e16caf6 (patch)
tree 80a4489b2b268b7512dd4eb566858a6bae8bfffe /tools/perf
parent bdfebd848f2a14e639031a0b0e61d7c7ee5e5fd2 (diff)
perf report: Add support for taken branch sampling
This patch adds support for taken branch sampling, i.e., the PERF_SAMPLE_BRANCH_STACK feature, to perf report. In other words, to display histograms based on taken branches rather than executed instruction addresses.

The new option is called -b and it takes no argument. To generate meaningful output, the perf.data must have been obtained using perf record -b xxx ... where xxx is a branch filter option.

The output shows symbols, modules, sorted by 'who branches where' the most often. The percentages reported in the first column refer to the total number of branches captured and not the usual number of samples.

Here is a quick example. Here branchy is a simple test program which looks as follows:

void f2(void)
{}
void f3(void)
{}
void f1(unsigned long n)
{
  if (n & 1UL)
    f2();
  else
    f3();
}
int main(void)
{
  unsigned long i;

  for (i=0; i < N; i++)
    f1(i);
  return 0;
}

Here is the output captured on Nehalem, if we are only interested in user level function calls.

$ perf record -b any_call,u -e cycles:u branchy

$ perf report -b --sort=symbol
    52.34%  [.] main                    [.] f1
    24.04%  [.] f1                      [.] f3
    23.60%  [.] f1                      [.] f2
     0.01%  [k] _IO_new_file_xsputn     [k] _IO_file_overflow
     0.01%  [k] _IO_vfprintf_internal   [k] _IO_new_file_xsputn
     0.01%  [k] _IO_vfprintf_internal   [k] strchrnul
     0.01%  [k] __printf                [k] _IO_vfprintf_internal
     0.01%  [k] main                    [k] __printf

About half (52%) of the call branches captured are from main() -> f1(). The second half (24%+23%) is split in two equal shares between f1() -> f2(), f1() -> f3(). The output is as expected given the code.

It should be noted that using -b in perf record does not eliminate information in the perf.data file. Consequently, a typical profile can also be obtained by perf report by simply not using its -b option.
It is possible to sort on branch-related columns:
- dso_from, symbol_from
- dso_to, symbol_to
- mispredict

Signed-off-by: Roberto Agostino Vitillo <ravitillo@lbl.gov>
Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: peterz@infradead.org
Cc: acme@redhat.com
Cc: robert.richter@amd.com
Cc: ming.m.lin@intel.com
Cc: andi@firstfloor.org
Cc: asharma@fb.com
Cc: vweaver1@eecs.utk.edu
Cc: khandual@linux.vnet.ibm.com
Cc: dsahern@gmail.com
Link: http://lkml.kernel.org/r/1328826068-11713-14-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'tools/perf')
-rw-r--r-- tools/perf/Documentation/perf-report.txt 7
-rw-r--r-- tools/perf/builtin-report.c 107
2 files changed, 105 insertions, 9 deletions
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 9b430e98712..19b9092cf8b 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -153,6 +153,13 @@ OPTIONS
153 information which may be very large and thus may clutter the display. 153 information which may be very large and thus may clutter the display.
154 It currently includes: cpu and numa topology of the host system. 154 It currently includes: cpu and numa topology of the host system.
155 155
156-b::
157--branch-stack::
158 Use the addresses of sampled taken branches instead of the instruction
159 address to build the histograms. To generate meaningful output, the
160 perf.data file must have been obtained using perf record -b xxx where
161 xxx is a branch filter option.
162
156SEE ALSO 163SEE ALSO
157-------- 164--------
158linkperf:perf-stat[1], linkperf:perf-annotate[1] 165linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 25d34d483e4..528789f6c70 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -53,6 +53,50 @@ struct perf_report {
53 DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); 53 DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
54}; 54};
55 55
56static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
57 struct addr_location *al,
58 struct perf_sample *sample,
59 struct perf_evsel *evsel,
60 struct machine *machine)
61{
62 struct perf_report *rep = container_of(tool, struct perf_report, tool);
63 struct symbol *parent = NULL;
64 int err = 0;
65 unsigned i;
66 struct hist_entry *he;
67 struct branch_info *bi;
68
69 if ((sort__has_parent || symbol_conf.use_callchain)
70 && sample->callchain) {
71 err = machine__resolve_callchain(machine, evsel, al->thread,
72 sample->callchain, &parent);
73 if (err)
74 return err;
75 }
76
77 bi = machine__resolve_bstack(machine, al->thread,
78 sample->branch_stack);
79 if (!bi)
80 return -ENOMEM;
81
82 for (i = 0; i < sample->branch_stack->nr; i++) {
83 if (rep->hide_unresolved && !(bi[i].from.sym && bi[i].to.sym))
84 continue;
85 /*
86 * The report shows the percentage of total branches captured
87 * and not events sampled. Thus we use a pseudo period of 1.
88 */
89 he = __hists__add_branch_entry(&evsel->hists, al, parent,
90 &bi[i], 1);
91 if (he) {
92 evsel->hists.stats.total_period += 1;
93 hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
94 } else
95 return -ENOMEM;
96 }
97 return err;
98}
99
56static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, 100static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
57 struct addr_location *al, 101 struct addr_location *al,
58 struct perf_sample *sample, 102 struct perf_sample *sample,
@@ -126,14 +170,21 @@ static int process_sample_event(struct perf_tool *tool,
126 if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap)) 170 if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
127 return 0; 171 return 0;
128 172
129 if (al.map != NULL) 173 if (sort__branch_mode) {
130 al.map->dso->hit = 1; 174 if (perf_report__add_branch_hist_entry(tool, &al, sample,
175 evsel, machine)) {
176 pr_debug("problem adding lbr entry, skipping event\n");
177 return -1;
178 }
179 } else {
180 if (al.map != NULL)
181 al.map->dso->hit = 1;
131 182
132 if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) { 183 if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) {
133 pr_debug("problem incrementing symbol period, skipping event\n"); 184 pr_debug("problem incrementing symbol period, skipping event\n");
134 return -1; 185 return -1;
186 }
135 } 187 }
136
137 return 0; 188 return 0;
138} 189}
139 190
@@ -188,6 +239,15 @@ static int perf_report__setup_sample_type(struct perf_report *rep)
188 } 239 }
189 } 240 }
190 241
242 if (sort__branch_mode) {
243 if (!(self->sample_type & PERF_SAMPLE_BRANCH_STACK)) {
244 fprintf(stderr, "selected -b but no branch data."
245 " Did you call perf record without"
246 " -b?\n");
247 return -1;
248 }
249 }
250
191 return 0; 251 return 0;
192} 252}
193 253
@@ -477,7 +537,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
477 OPT_BOOLEAN(0, "stdio", &report.use_stdio, 537 OPT_BOOLEAN(0, "stdio", &report.use_stdio,
478 "Use the stdio interface"), 538 "Use the stdio interface"),
479 OPT_STRING('s', "sort", &sort_order, "key[,key2...]", 539 OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
480 "sort by key(s): pid, comm, dso, symbol, parent"), 540 "sort by key(s): pid, comm, dso, symbol, parent, dso_to,"
541 " dso_from, symbol_to, symbol_from, mispredict"),
481 OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, 542 OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
482 "Show sample percentage for different cpu modes"), 543 "Show sample percentage for different cpu modes"),
483 OPT_STRING('p', "parent", &parent_pattern, "regex", 544 OPT_STRING('p', "parent", &parent_pattern, "regex",
@@ -517,6 +578,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
517 "Specify disassembler style (e.g. -M intel for intel syntax)"), 578 "Specify disassembler style (e.g. -M intel for intel syntax)"),
518 OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, 579 OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
519 "Show a column with the sum of periods"), 580 "Show a column with the sum of periods"),
581 OPT_BOOLEAN('b', "branch-stack", &sort__branch_mode,
582 "use branch records for histogram filling"),
520 OPT_END() 583 OPT_END()
521 }; 584 };
522 585
@@ -537,10 +600,36 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
537 report.input_name = "perf.data"; 600 report.input_name = "perf.data";
538 } 601 }
539 602
540 if (strcmp(report.input_name, "-") != 0) 603 if (sort__branch_mode) {
604 if (use_browser)
605 fprintf(stderr, "Warning: TUI interface not supported"
606 " in branch mode\n");
607 if (symbol_conf.dso_list_str != NULL)
608 fprintf(stderr, "Warning: dso filtering not supported"
609 " in branch mode\n");
610 if (symbol_conf.sym_list_str != NULL)
611 fprintf(stderr, "Warning: symbol filtering not"
612 " supported in branch mode\n");
613
614 report.use_stdio = true;
615 use_browser = 0;
541 setup_browser(true); 616 setup_browser(true);
542 else 617 symbol_conf.dso_list_str = NULL;
618 symbol_conf.sym_list_str = NULL;
619
620 /*
621 * if no sort_order is provided, then specify branch-mode
622 * specific order
623 */
624 if (sort_order == default_sort_order)
625 sort_order = "comm,dso_from,symbol_from,"
626 "dso_to,symbol_to";
627
628 } else if (strcmp(report.input_name, "-") != 0) {
629 setup_browser(true);
630 } else {
543 use_browser = 0; 631 use_browser = 0;
632 }
544 633
545 /* 634 /*
546 * Only in the newt browser we are doing integrated annotation, 635 * Only in the newt browser we are doing integrated annotation,