author		Andi Kleen <ak@linux.intel.com>	2014-11-12 21:05:20 -0500
committer	Arnaldo Carvalho de Melo <acme@redhat.com>	2014-12-01 18:00:31 -0500
commit		8b7bad58efb7e3aaff60f7c1fa4361fb8c23181d (patch)
tree		c39e7c6d25e0b88c22b3333d925e641437f5a91b /tools/perf/util/machine.c
parent		6c0345b73b970078c3e71ecc614a007207a1428a (diff)
perf callchain: Support handling complete branch stacks as histograms
Currently branch stacks can only be shown as edge histograms for
individual branches. I never found this display particularly useful.

This implements an alternative mode that creates histograms over
complete branch traces, instead of individual branches, similar to
how normal callgraphs are handled. This is done by putting it in
front of the normal callgraph and then using the normal callgraph
histogram infrastructure to unify them.

This way, in complex functions we can understand the control flow
that led to a particular sample, and may even see some control flow
in the caller for short functions.

Example (simplified; of course, for such simple code this is usually
not needed). Please run this after the whole patchkit is in, as at
this point in the patch order there is no --branch-history; that
will be added in a patch after this one:

tcall.c:

volatile a = 10000, b = 100000, c;

__attribute__((noinline)) f2()
{
	c = a / b;
}

__attribute__((noinline)) f1()
{
	f2();
	f2();
}

main()
{
	int i;
	for (i = 0; i < 1000000; i++)
		f1();
}

% perf record -b -g ./tsrc/tcall
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.044 MB perf.data (~1923 samples) ]
% perf report --no-children --branch-history
...
    54.91%  tcall.c:6  [.] f2                      tcall
            |
            |--65.53%-- f2 tcall.c:5
            |          |
            |          |--70.83%-- f1 tcall.c:11
            |          |          f1 tcall.c:10
            |          |          main tcall.c:18
            |          |          main tcall.c:18
            |          |          main tcall.c:17
            |          |          main tcall.c:17
            |          |          f1 tcall.c:13
            |          |          f1 tcall.c:13
            |          |          f2 tcall.c:7
            |          |          f2 tcall.c:5
            |          |          f1 tcall.c:12
            |          |          f1 tcall.c:12
            |          |          f2 tcall.c:7
            |          |          f2 tcall.c:5
            |          |          f1 tcall.c:11
            |          |
            |           --29.17%-- f1 tcall.c:12
            |                      f1 tcall.c:12
            |                      f2 tcall.c:7
            |                      f2 tcall.c:5
            |                      f1 tcall.c:11
            |                      f1 tcall.c:10
            |                      main tcall.c:18
            |                      main tcall.c:18
            |                      main tcall.c:17
            |                      main tcall.c:17
            |                      f1 tcall.c:13
            |                      f1 tcall.c:13
            |                      f2 tcall.c:7
            |                      f2 tcall.c:5
            |                      f1 tcall.c:12

The default output is unchanged.

This is only implemented in perf report; no change to record or
anywhere else.

This adds the basic code to report:

- add a new "branch" option to the -g option parser to enable this
  mode
- when the flag is set, include the LBR in the callstack in
  machine.c. The rest of the history code is unchanged and doesn't
  know the difference between an LBR entry and a normal call entry.
- detect overlaps with the callchain
- remove small loop duplicates in the LBR

Current limitations:

- The LBR flags (mispredict etc.) are not shown in the history, and
  LBR entries have no special marker.
- It would be nice if annotate marked the LBR entries somehow
  (e.g. with arrows)

v2: Various fixes.
v3: Merge further patches into this one. Fix white space.
v4: Improve manpage. Address review feedback.
v5: Rename functions. Better error message without -g. Fix crash
    without -b.
v6: Rebase
v7: Rebase. Use NO_ENTRY in memset.
v8: Port to latest tip. Move add_callchain_ip to separate patch.
    Skip initial entries in callchain. Minor cleanups.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1415844328-4884-3-git-send-email-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
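A side note for anyone reproducing the example: the tcall.c listing
above leans on pre-C99 implicit int and will not build with a modern
compiler. Below is a minimal compilable sketch of the same test
program; the explicit types and return statement are additions
(assumptions), not part of the original listing.

/* tcall.c - compilable sketch of the test program above; explicit
 * types added (assumption) so it builds with a modern C compiler. */
volatile int a = 10000, b = 100000, c;

__attribute__((noinline)) void f2(void)
{
	c = a / b;
}

__attribute__((noinline)) void f1(void)
{
	f2();
	f2();
}

int main(void)
{
	int i;

	for (i = 0; i < 1000000; i++)
		f1();
	return 0;
}

Build it with something like "gcc -g -O2 -o tcall tcall.c" before
running the perf record command shown above.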
Diffstat (limited to 'tools/perf/util/machine.c')
-rw-r--r--	tools/perf/util/machine.c	126
1 file changed, 117 insertions(+), 9 deletions(-)
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b75b487574c7..15dd0a9691ce 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -12,6 +12,7 @@
 #include <stdbool.h>
 #include <symbol/kallsyms.h>
 #include "unwind.h"
+#include "linux/hash.h"
 
 static void dsos__init(struct dsos *dsos)
 {
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
 
 	al.filtered = 0;
 	al.sym = NULL;
-	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+	if (cpumode == -1)
+		thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
+						   ip, &al);
+	else
+		thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
 				   ip, &al);
 	if (al.sym != NULL) {
 		if (sort__has_parent && !*parent &&
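The branch code added below passes cpumode == -1, because an LBR entry
does not record whether its addresses are kernel or user space; the
new fallback then probes each cpumode until the address resolves to a
map. A sketch of that fallback, consistent with the
thread__find_cpumode_addr_location() helper introduced earlier in
this series (treat the exact probe order as an assumption for
illustration):

/*
 * Sketch of the cpumode fallback taken when cpumode == -1: try each
 * cpumode in turn and stop at the first one that resolves the
 * address to a map.
 */
void thread__find_cpumode_addr_location(struct thread *thread,
					enum map_type type, u64 addr,
					struct addr_location *al)
{
	size_t i;
	const u8 cpumodes[] = {
		PERF_RECORD_MISC_USER,
		PERF_RECORD_MISC_KERNEL,
		PERF_RECORD_MISC_GUEST_USER,
		PERF_RECORD_MISC_GUEST_KERNEL
	};

	for (i = 0; i < ARRAY_SIZE(cpumodes); i++) {
		thread__find_addr_location(thread, cpumodes[i], type,
					   addr, al);
		if (al->map)
			break;
	}
}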
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
 	return bi;
 }
 
+#define CHASHSZ 127
+#define CHASHBITS 7
+#define NO_ENTRY 0xff
+
+#define PERF_MAX_BRANCH_DEPTH 127
+
+/* Remove loops. */
+static int remove_loops(struct branch_entry *l, int nr)
+{
+	int i, j, off;
+	unsigned char chash[CHASHSZ];
+
+	memset(chash, NO_ENTRY, sizeof(chash));
+
+	BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
+
+	for (i = 0; i < nr; i++) {
+		int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
+
+		/* no collision handling for now */
+		if (chash[h] == NO_ENTRY) {
+			chash[h] = i;
+		} else if (l[chash[h]].from == l[i].from) {
+			bool is_loop = true;
+			/* check if it is a real loop */
+			off = 0;
+			for (j = chash[h]; j < i && i + off < nr; j++, off++)
+				if (l[j].from != l[i + off].from) {
+					is_loop = false;
+					break;
+				}
+			if (is_loop) {
+				memmove(l + i, l + i + off,
+					(nr - (i + off)) * sizeof(*l));
+				nr -= off;
+			}
+		}
+	}
+	return nr;
+}
+
 static int thread__resolve_callchain_sample(struct thread *thread,
 					    struct ip_callchain *chain,
+					    struct branch_stack *branch,
 					    struct symbol **parent,
 					    struct addr_location *root_al,
 					    int max_stack)
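Taken out of context, remove_loops() is easiest to understand with a
driver. Below is a standalone userspace sketch of the same algorithm:
the two-field branch_entry and the modulo hash are simplified
stand-ins (assumptions) for perf's struct and hash_64(), but the
collapse logic mirrors the hunk above.

/*
 * Standalone sketch of the remove_loops() algorithm above.
 * Assumptions: a stub two-field branch_entry and a trivial modulo
 * hash stand in for perf's real struct and linux/hash.h; the
 * loop-removal logic itself follows the patch.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>

struct branch_entry {
	uint64_t from;
	uint64_t to;
};

#define CHASHSZ 127
#define NO_ENTRY 0xff

/* stand-in for hash_64() from linux/hash.h */
static int hash_addr(uint64_t v)
{
	return (int)(v % CHASHSZ);
}

static int remove_loops(struct branch_entry *l, int nr)
{
	int i, j, off;
	unsigned char chash[CHASHSZ];

	memset(chash, NO_ENTRY, sizeof(chash));

	for (i = 0; i < nr; i++) {
		int h = hash_addr(l[i].from);

		if (chash[h] == NO_ENTRY) {
			chash[h] = i;	/* first sighting of this 'from' */
		} else if (l[chash[h]].from == l[i].from) {
			bool is_loop = true;

			/* verify the span since the last sighting repeats */
			off = 0;
			for (j = chash[h]; j < i && i + off < nr; j++, off++)
				if (l[j].from != l[i + off].from) {
					is_loop = false;
					break;
				}
			if (is_loop) {
				/* drop one iteration of the loop body */
				memmove(l + i, l + i + off,
					(nr - (i + off)) * sizeof(*l));
				nr -= off;
			}
		}
	}
	return nr;
}

int main(void)
{
	/* two iterations of a loop 0x100 -> 0x300; one gets collapsed */
	struct branch_entry br[] = {
		{ 0x100, 0x200 }, { 0x300, 0x100 },
		{ 0x100, 0x200 }, { 0x300, 0x100 },
		{ 0x400, 0x500 },
	};
	int i, nr = remove_loops(br, 5);

	for (i = 0; i < nr; i++)
		printf("%#" PRIx64 " -> %#" PRIx64 "\n",
		       br[i].from, br[i].to);
	return 0;
}

Running it prints three entries: the second iteration of the
0x100/0x300 loop has been folded away, matching how the patch keeps
one representative iteration per detected loop.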
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	int i;
 	int j;
 	int err;
-	int skip_idx __maybe_unused;
+	int skip_idx = -1;
+	int first_call = 0;
+
+	/*
+	 * Based on DWARF debug information, some architectures skip
+	 * a callchain entry saved by the kernel.
+	 */
+	if (chain->nr < PERF_MAX_STACK_DEPTH)
+		skip_idx = arch_skip_callchain_idx(thread, chain);
 
 	callchain_cursor_reset(&callchain_cursor);
 
+	/*
+	 * Add branches to call stack for easier browsing. This gives
+	 * more context for a sample than just the callers.
+	 *
+	 * This uses individual histograms of paths compared to the
+	 * aggregated histograms the normal LBR mode uses.
+	 *
+	 * Limitations for now:
+	 * - No extra filters
+	 * - No annotations (should annotate somehow)
+	 */
+
+	if (branch && callchain_param.branch_callstack) {
+		int nr = min(max_stack, (int)branch->nr);
+		struct branch_entry be[nr];
+
+		if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
+			pr_warning("corrupted branch chain. skipping...\n");
+			goto check_calls;
+		}
+
+		for (i = 0; i < nr; i++) {
+			if (callchain_param.order == ORDER_CALLEE) {
+				be[i] = branch->entries[i];
+				/*
+				 * Check for overlap into the callchain.
+				 * The return address is one off compared to
+				 * the branch entry. To adjust for this
+				 * assume the calling instruction is not longer
+				 * than 8 bytes.
+				 */
+				if (i == skip_idx ||
+				    chain->ips[first_call] >= PERF_CONTEXT_MAX)
+					first_call++;
+				else if (be[i].from < chain->ips[first_call] &&
+					 be[i].from >= chain->ips[first_call] - 8)
+					first_call++;
+			} else
+				be[i] = branch->entries[branch->nr - i - 1];
+		}
+
+		nr = remove_loops(be, nr);
+
+		for (i = 0; i < nr; i++) {
+			err = add_callchain_ip(thread, parent, root_al,
+					       -1, be[i].to);
+			if (!err)
+				err = add_callchain_ip(thread, parent, root_al,
+						       -1, be[i].from);
+			if (err == -EINVAL)
+				break;
+			if (err)
+				return err;
+		}
+		chain_nr -= nr;
+	}
+
+check_calls:
 	if (chain->nr > PERF_MAX_STACK_DEPTH) {
 		pr_warning("corrupted callchain. skipping...\n");
 		return 0;
 	}
 
-	/*
-	 * Based on DWARF debug information, some architectures skip
-	 * a callchain entry saved by the kernel.
-	 */
-	skip_idx = arch_skip_callchain_idx(thread, chain);
-
-	for (i = 0; i < chain_nr; i++) {
+	for (i = first_call; i < chain_nr; i++) {
 		u64 ip;
 
 		if (callchain_param.order == ORDER_CALLEE)
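One subtlety in the hunk above is the overlap test: the kernel
callchain stores return addresses, while an LBR entry's 'from' is the
address of the call instruction itself, so the two describe the same
call site when the branch source falls within the (assumed at most
8-byte) call instruction just before the return address. A standalone
sketch of that heuristic, with hypothetical addresses:

/*
 * Sketch of the overlap heuristic above: callchain entries are
 * return addresses, LBR 'from' addresses are call instructions, so
 * they refer to the same call site when the branch source lies
 * within the (assumed <= 8 byte) call just before the return address.
 */
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

static bool branch_overlaps_callchain(uint64_t branch_from,
				      uint64_t chain_ip)
{
	return branch_from < chain_ip && branch_from >= chain_ip - 8;
}

int main(void)
{
	/* a 5-byte x86 call at 0x400100 returns to 0x400105 */
	assert(branch_overlaps_callchain(0x400100, 0x400105));
	/* an unrelated address does not overlap */
	assert(!branch_overlaps_callchain(0x400200, 0x400105));
	return 0;
}

Each overlap detected advances first_call by one, so callchain
entries already covered by LBR entries are not added twice.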
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
 			      int max_stack)
 {
 	int ret = thread__resolve_callchain_sample(thread, sample->callchain,
+						   sample->branch_stack,
 						   parent, root_al, max_stack);
 	if (ret)
 		return ret;