author		Andi Kleen <ak@linux.intel.com>	2014-11-12 21:05:20 -0500
committer	Arnaldo Carvalho de Melo <acme@redhat.com>	2014-12-01 18:00:31 -0500
commit		8b7bad58efb7e3aaff60f7c1fa4361fb8c23181d (patch)
tree		c39e7c6d25e0b88c22b3333d925e641437f5a91b /tools/perf/util/machine.c
parent		6c0345b73b970078c3e71ecc614a007207a1428a (diff)
perf callchain: Support handling complete branch stacks as histograms
Currently branch stacks can only be shown as edge histograms for
individual branches. I never found this display particularly useful.

This implements an alternative mode that creates histograms over
complete branch traces, instead of individual branches, similar to
how normal callgraphs are handled. This is done by putting it in
front of the normal callgraph and then using the normal callgraph
histogram infrastructure to unify them.

This way, in complex functions we can understand the control flow
that led to a particular sample, and may even see some control flow
in the caller for short functions.

Example (simplified; of course, for such simple code this is usually
not needed). Please run this after the whole patchkit is in, as at
this point in the patch order there is no --branch-history; that
will be added in a patch after this one:

tcall.c:

volatile a = 10000, b = 100000, c;

__attribute__((noinline)) f2()
{
	c = a / b;
}

__attribute__((noinline)) f1()
{
	f2();
	f2();
}

main()
{
	int i;
	for (i = 0; i < 1000000; i++)
		f1();
}

% perf record -b -g ./tsrc/tcall
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.044 MB perf.data (~1923 samples) ]
% perf report --no-children --branch-history
...
    54.91%  tcall.c:6  [.] f2                      tcall
            |
            |--65.53%-- f2 tcall.c:5
            |          |
            |          |--70.83%-- f1 tcall.c:11
            |          |          f1 tcall.c:10
            |          |          main tcall.c:18
            |          |          main tcall.c:18
            |          |          main tcall.c:17
            |          |          main tcall.c:17
            |          |          f1 tcall.c:13
            |          |          f1 tcall.c:13
            |          |          f2 tcall.c:7
            |          |          f2 tcall.c:5
            |          |          f1 tcall.c:12
            |          |          f1 tcall.c:12
            |          |          f2 tcall.c:7
            |          |          f2 tcall.c:5
            |          |          f1 tcall.c:11
            |          |
            |           --29.17%-- f1 tcall.c:12
            |                      f1 tcall.c:12
            |                      f2 tcall.c:7
            |                      f2 tcall.c:5
            |                      f1 tcall.c:11
            |                      f1 tcall.c:10
            |                      main tcall.c:18
            |                      main tcall.c:18
            |                      main tcall.c:17
            |                      main tcall.c:17
            |                      f1 tcall.c:13
            |                      f1 tcall.c:13
            |                      f2 tcall.c:7
            |                      f2 tcall.c:5
            |                      f1 tcall.c:12

The default output is unchanged.

This is only implemented in perf report; no change to record or
anywhere else.

This adds the basic code to report:

- add a new "branch" option to the -g option parser to enable this
  mode
- when the flag is set, include the LBR in the callstack in
  machine.c. The rest of the history code is unchanged and doesn't
  know the difference between an LBR entry and a normal call entry.
- detect overlaps with the callchain
- remove small loop duplicates in the LBR

Current limitations:

- The LBR flags (mispredict etc.) are not shown in the history, and
  LBR entries have no special marker.
- It would be nice if annotate marked the LBR entries somehow
  (e.g. with arrows)

v2: Various fixes.
v3: Merge further patches into this one. Fix white space.
v4: Improve manpage. Address review feedback.
v5: Rename functions. Better error message without -g. Fix crash
    without -b.
v6: Rebase
v7: Rebase. Use NO_ENTRY in memset.
v8: Port to latest tip. Move add_callchain_ip to separate patch.
    Skip initial entries in callchain. Minor cleanups.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/1415844328-4884-3-git-send-email-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
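A side note for anyone reproducing the example: the tcall.c listing
above leans on pre-C99 implicit int and will not build with a modern
compiler. Below is a minimal compilable sketch of the same test
program; the explicit types and return statement are additions
(assumptions), not part of the original listing.

/* tcall.c - compilable sketch of the test program above; explicit
 * types added (assumption) so it builds with a modern C compiler. */
volatile int a = 10000, b = 100000, c;

__attribute__((noinline)) void f2(void)
{
	c = a / b;
}

__attribute__((noinline)) void f1(void)
{
	f2();
	f2();
}

int main(void)
{
	int i;

	for (i = 0; i < 1000000; i++)
		f1();
	return 0;
}

Build it with something like "gcc -g -O2 -o tcall tcall.c" before
running the perf record command shown above.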
Diffstat (limited to 'tools/perf/util/machine.c')
-rw-r--r--	tools/perf/util/machine.c	126
1 file changed, 117 insertions(+), 9 deletions(-)
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b75b487574c7..15dd0a9691ce 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -12,6 +12,7 @@
 #include <stdbool.h>
 #include <symbol/kallsyms.h>
 #include "unwind.h"
+#include "linux/hash.h"
 
 static void dsos__init(struct dsos *dsos)
 {
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
 
 	al.filtered = 0;
 	al.sym = NULL;
-	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+	if (cpumode == -1)
+		thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
+						   ip, &al);
+	else
+		thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
 				   ip, &al);
 	if (al.sym != NULL) {
 		if (sort__has_parent && !*parent &&
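The branch code added below passes cpumode == -1, because an LBR entry
does not record whether its addresses are kernel or user space; the
new fallback then probes each cpumode until the address resolves to a
map. A sketch of that fallback, consistent with the
thread__find_cpumode_addr_location() helper introduced earlier in
this series (treat the exact probe order as an assumption for
illustration):

/*
 * Sketch of the cpumode fallback taken when cpumode == -1: try each
 * cpumode in turn and stop at the first one that resolves the
 * address to a map.
 */
void thread__find_cpumode_addr_location(struct thread *thread,
					enum map_type type, u64 addr,
					struct addr_location *al)
{
	size_t i;
	const u8 cpumodes[] = {
		PERF_RECORD_MISC_USER,
		PERF_RECORD_MISC_KERNEL,
		PERF_RECORD_MISC_GUEST_USER,
		PERF_RECORD_MISC_GUEST_KERNEL
	};

	for (i = 0; i < ARRAY_SIZE(cpumodes); i++) {
		thread__find_addr_location(thread, cpumodes[i], type,
					   addr, al);
		if (al->map)
			break;
	}
}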
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
 	return bi;
 }
 
+#define CHASHSZ 127
+#define CHASHBITS 7
+#define NO_ENTRY 0xff
+
+#define PERF_MAX_BRANCH_DEPTH 127
+
+/* Remove loops. */
+static int remove_loops(struct branch_entry *l, int nr)
+{
+	int i, j, off;
+	unsigned char chash[CHASHSZ];
+
+	memset(chash, NO_ENTRY, sizeof(chash));
+
+	BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
+
+	for (i = 0; i < nr; i++) {
+		int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
+
+		/* no collision handling for now */
+		if (chash[h] == NO_ENTRY) {
+			chash[h] = i;
+		} else if (l[chash[h]].from == l[i].from) {
+			bool is_loop = true;
+			/* check if it is a real loop */
+			off = 0;
+			for (j = chash[h]; j < i && i + off < nr; j++, off++)
+				if (l[j].from != l[i + off].from) {
+					is_loop = false;
+					break;
+				}
+			if (is_loop) {
+				memmove(l + i, l + i + off,
+					(nr - (i + off)) * sizeof(*l));
+				nr -= off;
+			}
+		}
+	}
+	return nr;
+}
+
 static int thread__resolve_callchain_sample(struct thread *thread,
 					    struct ip_callchain *chain,
+					    struct branch_stack *branch,
 					    struct symbol **parent,
 					    struct addr_location *root_al,
 					    int max_stack)
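Taken out of context, remove_loops() is easiest to understand with a
driver. Below is a standalone userspace sketch of the same algorithm:
the two-field branch_entry and the modulo hash are simplified
stand-ins (assumptions) for perf's struct and hash_64(), but the
collapse logic mirrors the hunk above.

/*
 * Standalone sketch of the remove_loops() algorithm above.
 * Assumptions: a stub two-field branch_entry and a trivial modulo
 * hash stand in for perf's real struct and linux/hash.h; the
 * loop-removal logic itself follows the patch.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>

struct branch_entry {
	uint64_t from;
	uint64_t to;
};

#define CHASHSZ 127
#define NO_ENTRY 0xff

/* stand-in for hash_64() from linux/hash.h */
static int hash_addr(uint64_t v)
{
	return (int)(v % CHASHSZ);
}

static int remove_loops(struct branch_entry *l, int nr)
{
	int i, j, off;
	unsigned char chash[CHASHSZ];

	memset(chash, NO_ENTRY, sizeof(chash));

	for (i = 0; i < nr; i++) {
		int h = hash_addr(l[i].from);

		if (chash[h] == NO_ENTRY) {
			chash[h] = i;	/* first sighting of this 'from' */
		} else if (l[chash[h]].from == l[i].from) {
			bool is_loop = true;

			/* verify the span since the last sighting repeats */
			off = 0;
			for (j = chash[h]; j < i && i + off < nr; j++, off++)
				if (l[j].from != l[i + off].from) {
					is_loop = false;
					break;
				}
			if (is_loop) {
				/* drop one iteration of the loop body */
				memmove(l + i, l + i + off,
					(nr - (i + off)) * sizeof(*l));
				nr -= off;
			}
		}
	}
	return nr;
}

int main(void)
{
	/* two iterations of a loop 0x100 -> 0x300; one gets collapsed */
	struct branch_entry br[] = {
		{ 0x100, 0x200 }, { 0x300, 0x100 },
		{ 0x100, 0x200 }, { 0x300, 0x100 },
		{ 0x400, 0x500 },
	};
	int i, nr = remove_loops(br, 5);

	for (i = 0; i < nr; i++)
		printf("%#" PRIx64 " -> %#" PRIx64 "\n",
		       br[i].from, br[i].to);
	return 0;
}

Running it prints three entries: the second iteration of the
0x100/0x300 loop has been folded away, matching how the patch keeps
one representative iteration per detected loop.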
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	int i;
 	int j;
 	int err;
-	int skip_idx __maybe_unused;
+	int skip_idx = -1;
+	int first_call = 0;
+
+	/*
+	 * Based on DWARF debug information, some architectures skip
+	 * a callchain entry saved by the kernel.
+	 */
+	if (chain->nr < PERF_MAX_STACK_DEPTH)
+		skip_idx = arch_skip_callchain_idx(thread, chain);
 
 	callchain_cursor_reset(&callchain_cursor);
 
+	/*
+	 * Add branches to call stack for easier browsing. This gives
+	 * more context for a sample than just the callers.
+	 *
+	 * This uses individual histograms of paths compared to the
+	 * aggregated histograms the normal LBR mode uses.
+	 *
+	 * Limitations for now:
+	 * - No extra filters
+	 * - No annotations (should annotate somehow)
+	 */
+
+	if (branch && callchain_param.branch_callstack) {
+		int nr = min(max_stack, (int)branch->nr);
+		struct branch_entry be[nr];
+
+		if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
+			pr_warning("corrupted branch chain. skipping...\n");
+			goto check_calls;
+		}
+
+		for (i = 0; i < nr; i++) {
+			if (callchain_param.order == ORDER_CALLEE) {
+				be[i] = branch->entries[i];
+				/*
+				 * Check for overlap into the callchain.
+				 * The return address is one off compared to
+				 * the branch entry. To adjust for this
+				 * assume the calling instruction is not longer
+				 * than 8 bytes.
+				 */
+				if (i == skip_idx ||
+				    chain->ips[first_call] >= PERF_CONTEXT_MAX)
+					first_call++;
+				else if (be[i].from < chain->ips[first_call] &&
+					 be[i].from >= chain->ips[first_call] - 8)
+					first_call++;
+			} else
+				be[i] = branch->entries[branch->nr - i - 1];
+		}
+
+		nr = remove_loops(be, nr);
+
+		for (i = 0; i < nr; i++) {
+			err = add_callchain_ip(thread, parent, root_al,
+					       -1, be[i].to);
+			if (!err)
+				err = add_callchain_ip(thread, parent, root_al,
+						       -1, be[i].from);
+			if (err == -EINVAL)
+				break;
+			if (err)
+				return err;
+		}
+		chain_nr -= nr;
+	}
+
+check_calls:
 	if (chain->nr > PERF_MAX_STACK_DEPTH) {
 		pr_warning("corrupted callchain. skipping...\n");
 		return 0;
 	}
 
-	/*
-	 * Based on DWARF debug information, some architectures skip
-	 * a callchain entry saved by the kernel.
-	 */
-	skip_idx = arch_skip_callchain_idx(thread, chain);
-
-	for (i = 0; i < chain_nr; i++) {
+	for (i = first_call; i < chain_nr; i++) {
 		u64 ip;
 
 		if (callchain_param.order == ORDER_CALLEE)
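One subtlety in the hunk above is the overlap test: the kernel
callchain stores return addresses, while an LBR entry's 'from' is the
address of the call instruction itself, so the two describe the same
call site when the branch source falls within the (assumed at most
8-byte) call instruction just before the return address. A standalone
sketch of that heuristic, with hypothetical addresses:

/*
 * Sketch of the overlap heuristic above: callchain entries are
 * return addresses, LBR 'from' addresses are call instructions, so
 * they refer to the same call site when the branch source lies
 * within the (assumed <= 8 byte) call just before the return address.
 */
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

static bool branch_overlaps_callchain(uint64_t branch_from,
				      uint64_t chain_ip)
{
	return branch_from < chain_ip && branch_from >= chain_ip - 8;
}

int main(void)
{
	/* a 5-byte x86 call at 0x400100 returns to 0x400105 */
	assert(branch_overlaps_callchain(0x400100, 0x400105));
	/* an unrelated address does not overlap */
	assert(!branch_overlaps_callchain(0x400200, 0x400105));
	return 0;
}

Each overlap detected advances first_call by one, so callchain
entries already covered by LBR entries are not added twice.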
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
 			      int max_stack)
 {
 	int ret = thread__resolve_callchain_sample(thread, sample->callchain,
+						   sample->branch_stack,
 						   parent, root_al, max_stack);
 	if (ret)
 		return ret;