aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <ak@linux.intel.com>2014-11-12 21:05:20 -0500
committerArnaldo Carvalho de Melo <acme@redhat.com>2014-12-01 18:00:31 -0500
commit8b7bad58efb7e3aaff60f7c1fa4361fb8c23181d (patch)
treec39e7c6d25e0b88c22b3333d925e641437f5a91b
parent6c0345b73b970078c3e71ecc614a007207a1428a (diff)
perf callchain: Support handling complete branch stacks as histograms
Currently branch stacks can be only shown as edge histograms for individual branches. I never found this display particularly useful. This implements an alternative mode that creates histograms over complete branch traces, instead of individual branches, similar to how normal callgraphs are handled. This is done by putting it in front of the normal callgraph and then using the normal callgraph histogram infrastructure to unify them. This way in complex functions we can understand the control flow that lead to a particular sample, and may even see some control flow in the caller for short functions. Example (simplified, of course for such simple code this is usually not needed), please run this after the whole patchkit is in, as at this point in the patch order there is no --branch-history, that will be added in a patch after this one: tcall.c: volatile a = 10000, b = 100000, c; __attribute__((noinline)) f2() { c = a / b; } __attribute__((noinline)) f1() { f2(); f2(); } main() { int i; for (i = 0; i < 1000000; i++) f1(); } % perf record -b -g ./tsrc/tcall [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.044 MB perf.data (~1923 samples) ] % perf report --no-children --branch-history ... 54.91% tcall.c:6 [.] f2 tcall | |--65.53%-- f2 tcall.c:5 | | | |--70.83%-- f1 tcall.c:11 | | f1 tcall.c:10 | | main tcall.c:18 | | main tcall.c:18 | | main tcall.c:17 | | main tcall.c:17 | | f1 tcall.c:13 | | f1 tcall.c:13 | | f2 tcall.c:7 | | f2 tcall.c:5 | | f1 tcall.c:12 | | f1 tcall.c:12 | | f2 tcall.c:7 | | f2 tcall.c:5 | | f1 tcall.c:11 | | | --29.17%-- f1 tcall.c:12 | f1 tcall.c:12 | f2 tcall.c:7 | f2 tcall.c:5 | f1 tcall.c:11 | f1 tcall.c:10 | main tcall.c:18 | main tcall.c:18 | main tcall.c:17 | main tcall.c:17 | f1 tcall.c:13 | f1 tcall.c:13 | f2 tcall.c:7 | f2 tcall.c:5 | f1 tcall.c:12 The default output is unchanged. This is only implemented in perf report, no change to record or anywhere else. This adds the basic code to report: - add a new "branch" option to the -g option parser to enable this mode - when the flag is set include the LBR into the callstack in machine.c. The rest of the history code is unchanged and doesn't know the difference between LBR entry and normal call entry. - detect overlaps with the callchain - remove small loop duplicates in the LBR Current limitations: - The LBR flags (mispredict etc.) are not shown in the history and LBR entries have no special marker. - It would be nice if annotate marked the LBR entries somehow (e.g. with arrows) v2: Various fixes. v3: Merge further patches into this one. Fix white space. v4: Improve manpage. Address review feedback. v5: Rename functions. Better error message without -g. Fix crash without -b. v6: Rebase v7: Rebase. Use NO_ENTRY in memset. v8: Port to latest tip. Move add_callchain_ip to separate patch. Skip initial entries in callchain. Minor cleanups. Signed-off-by: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lkml.kernel.org/r/1415844328-4884-3-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
-rw-r--r--tools/perf/Documentation/perf-report.txt7
-rw-r--r--tools/perf/builtin-report.c4
-rw-r--r--tools/perf/util/callchain.c4
-rw-r--r--tools/perf/util/callchain.h1
-rw-r--r--tools/perf/util/machine.c126
-rw-r--r--tools/perf/util/symbol.h3
6 files changed, 132 insertions, 13 deletions
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 0927bf4e6c2a..22706beffabc 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -159,7 +159,7 @@ OPTIONS
159--dump-raw-trace:: 159--dump-raw-trace::
160 Dump raw trace in ASCII. 160 Dump raw trace in ASCII.
161 161
162-g [type,min[,limit],order[,key]]:: 162-g [type,min[,limit],order[,key][,branch]]::
163--call-graph:: 163--call-graph::
164 Display call chains using type, min percent threshold, optional print 164 Display call chains using type, min percent threshold, optional print
165 limit and order. 165 limit and order.
@@ -177,6 +177,11 @@ OPTIONS
177 - function: compare on functions 177 - function: compare on functions
178 - address: compare on individual code addresses 178 - address: compare on individual code addresses
179 179
180 branch can be:
181 - branch: include last branch information in callgraph
182 when available. Usually more convenient to use --branch-history
183 for this.
184
180 Default: fractal,0.5,callee,function. 185 Default: fractal,0.5,callee,function.
181 186
182--children:: 187--children::
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 140a6cd88351..410d44fac64f 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -637,8 +637,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
637 "regex filter to identify parent, see: '--sort parent'"), 637 "regex filter to identify parent, see: '--sort parent'"),
638 OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, 638 OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
639 "Only display entries with parent-match"), 639 "Only display entries with parent-match"),
640 OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order", 640 OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order[,branch]",
641 "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). " 641 "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address), add branches. "
642 "Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt), 642 "Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt),
643 OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain, 643 OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
644 "Accumulate callchains of children and show total overhead as well"), 644 "Accumulate callchains of children and show total overhead as well"),
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 517ed84db97a..cf524a35cc84 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -149,6 +149,10 @@ static int parse_callchain_sort_key(const char *value)
149 callchain_param.key = CCKEY_ADDRESS; 149 callchain_param.key = CCKEY_ADDRESS;
150 return 0; 150 return 0;
151 } 151 }
152 if (!strncmp(value, "branch", strlen(value))) {
153 callchain_param.branch_callstack = 1;
154 return 0;
155 }
152 return -1; 156 return -1;
153} 157}
154 158
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 3f158474c892..dbc08cf5f970 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -63,6 +63,7 @@ struct callchain_param {
63 sort_chain_func_t sort; 63 sort_chain_func_t sort;
64 enum chain_order order; 64 enum chain_order order;
65 enum chain_key key; 65 enum chain_key key;
66 bool branch_callstack;
66}; 67};
67 68
68extern struct callchain_param callchain_param; 69extern struct callchain_param callchain_param;
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b75b487574c7..15dd0a9691ce 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -12,6 +12,7 @@
12#include <stdbool.h> 12#include <stdbool.h>
13#include <symbol/kallsyms.h> 13#include <symbol/kallsyms.h>
14#include "unwind.h" 14#include "unwind.h"
15#include "linux/hash.h"
15 16
16static void dsos__init(struct dsos *dsos) 17static void dsos__init(struct dsos *dsos)
17{ 18{
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
1391 1392
1392 al.filtered = 0; 1393 al.filtered = 0;
1393 al.sym = NULL; 1394 al.sym = NULL;
1394 thread__find_addr_location(thread, cpumode, MAP__FUNCTION, 1395 if (cpumode == -1)
1396 thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
1397 ip, &al);
1398 else
1399 thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
1395 ip, &al); 1400 ip, &al);
1396 if (al.sym != NULL) { 1401 if (al.sym != NULL) {
1397 if (sort__has_parent && !*parent && 1402 if (sort__has_parent && !*parent &&
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
1427 return bi; 1432 return bi;
1428} 1433}
1429 1434
1435#define CHASHSZ 127
1436#define CHASHBITS 7
1437#define NO_ENTRY 0xff
1438
1439#define PERF_MAX_BRANCH_DEPTH 127
1440
1441/* Remove loops. */
1442static int remove_loops(struct branch_entry *l, int nr)
1443{
1444 int i, j, off;
1445 unsigned char chash[CHASHSZ];
1446
1447 memset(chash, NO_ENTRY, sizeof(chash));
1448
1449 BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
1450
1451 for (i = 0; i < nr; i++) {
1452 int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
1453
1454 /* no collision handling for now */
1455 if (chash[h] == NO_ENTRY) {
1456 chash[h] = i;
1457 } else if (l[chash[h]].from == l[i].from) {
1458 bool is_loop = true;
1459 /* check if it is a real loop */
1460 off = 0;
1461 for (j = chash[h]; j < i && i + off < nr; j++, off++)
1462 if (l[j].from != l[i + off].from) {
1463 is_loop = false;
1464 break;
1465 }
1466 if (is_loop) {
1467 memmove(l + i, l + i + off,
1468 (nr - (i + off)) * sizeof(*l));
1469 nr -= off;
1470 }
1471 }
1472 }
1473 return nr;
1474}
1475
1430static int thread__resolve_callchain_sample(struct thread *thread, 1476static int thread__resolve_callchain_sample(struct thread *thread,
1431 struct ip_callchain *chain, 1477 struct ip_callchain *chain,
1478 struct branch_stack *branch,
1432 struct symbol **parent, 1479 struct symbol **parent,
1433 struct addr_location *root_al, 1480 struct addr_location *root_al,
1434 int max_stack) 1481 int max_stack)
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
1438 int i; 1485 int i;
1439 int j; 1486 int j;
1440 int err; 1487 int err;
1441 int skip_idx __maybe_unused; 1488 int skip_idx = -1;
1489 int first_call = 0;
1490
1491 /*
1492 * Based on DWARF debug information, some architectures skip
1493 * a callchain entry saved by the kernel.
1494 */
1495 if (chain->nr < PERF_MAX_STACK_DEPTH)
1496 skip_idx = arch_skip_callchain_idx(thread, chain);
1442 1497
1443 callchain_cursor_reset(&callchain_cursor); 1498 callchain_cursor_reset(&callchain_cursor);
1444 1499
1500 /*
1501 * Add branches to call stack for easier browsing. This gives
1502 * more context for a sample than just the callers.
1503 *
1504 * This uses individual histograms of paths compared to the
1505 * aggregated histograms the normal LBR mode uses.
1506 *
1507 * Limitations for now:
1508 * - No extra filters
1509 * - No annotations (should annotate somehow)
1510 */
1511
1512 if (branch && callchain_param.branch_callstack) {
1513 int nr = min(max_stack, (int)branch->nr);
1514 struct branch_entry be[nr];
1515
1516 if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
1517 pr_warning("corrupted branch chain. skipping...\n");
1518 goto check_calls;
1519 }
1520
1521 for (i = 0; i < nr; i++) {
1522 if (callchain_param.order == ORDER_CALLEE) {
1523 be[i] = branch->entries[i];
1524 /*
1525 * Check for overlap into the callchain.
1526 * The return address is one off compared to
1527 * the branch entry. To adjust for this
1528 * assume the calling instruction is not longer
1529 * than 8 bytes.
1530 */
1531 if (i == skip_idx ||
1532 chain->ips[first_call] >= PERF_CONTEXT_MAX)
1533 first_call++;
1534 else if (be[i].from < chain->ips[first_call] &&
1535 be[i].from >= chain->ips[first_call] - 8)
1536 first_call++;
1537 } else
1538 be[i] = branch->entries[branch->nr - i - 1];
1539 }
1540
1541 nr = remove_loops(be, nr);
1542
1543 for (i = 0; i < nr; i++) {
1544 err = add_callchain_ip(thread, parent, root_al,
1545 -1, be[i].to);
1546 if (!err)
1547 err = add_callchain_ip(thread, parent, root_al,
1548 -1, be[i].from);
1549 if (err == -EINVAL)
1550 break;
1551 if (err)
1552 return err;
1553 }
1554 chain_nr -= nr;
1555 }
1556
1557check_calls:
1445 if (chain->nr > PERF_MAX_STACK_DEPTH) { 1558 if (chain->nr > PERF_MAX_STACK_DEPTH) {
1446 pr_warning("corrupted callchain. skipping...\n"); 1559 pr_warning("corrupted callchain. skipping...\n");
1447 return 0; 1560 return 0;
1448 } 1561 }
1449 1562
1450 /* 1563 for (i = first_call; i < chain_nr; i++) {
1451 * Based on DWARF debug information, some architectures skip
1452 * a callchain entry saved by the kernel.
1453 */
1454 skip_idx = arch_skip_callchain_idx(thread, chain);
1455
1456 for (i = 0; i < chain_nr; i++) {
1457 u64 ip; 1564 u64 ip;
1458 1565
1459 if (callchain_param.order == ORDER_CALLEE) 1566 if (callchain_param.order == ORDER_CALLEE)
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
1517 int max_stack) 1624 int max_stack)
1518{ 1625{
1519 int ret = thread__resolve_callchain_sample(thread, sample->callchain, 1626 int ret = thread__resolve_callchain_sample(thread, sample->callchain,
1627 sample->branch_stack,
1520 parent, root_al, max_stack); 1628 parent, root_al, max_stack);
1521 if (ret) 1629 if (ret)
1522 return ret; 1630 return ret;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index e0b297c50f9d..9d602e9c6f59 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -102,7 +102,8 @@ struct symbol_conf {
102 demangle, 102 demangle,
103 demangle_kernel, 103 demangle_kernel,
104 filter_relative, 104 filter_relative,
105 show_hist_headers; 105 show_hist_headers,
106 branch_callstack;
106 const char *vmlinux_name, 107 const char *vmlinux_name,
107 *kallsyms_name, 108 *kallsyms_name,
108 *source_prefix, 109 *source_prefix,