9 files changed, 257 insertions, 89 deletions
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 0927bf4e6c2a..dd7cccdde498 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -159,7 +159,7 @@ OPTIONS
 --dump-raw-trace::
        Dump raw trace in ASCII.
-g [type,min[,limit],order[,key]]::
+-g [type,min[,limit],order[,key][,branch]]::
 --call-graph::
        Display call chains using type, min percent threshold, optional print
        limit and order.
@@ -177,6 +177,11 @@ OPTIONS
        - function: compare on functions
        - address: compare on individual code addresses
+        branch can be:
+        - branch: include last branch information in the callgraph
+        when available. It is usually more convenient to use
+        --branch-history for this.
        Default: fractal,0.5,callee,function.
 --children::
@@ -266,6 +271,11 @@ OPTIONS
        branch stacks and it will automatically switch to the branch view mode,
        unless --no-branch-stack is used.
+--branch-history::
+        Add the addresses of sampled taken branches to the callstack.
+        This allows examining the path the program took to each sample.
+        The data collection must have used -b (or -j) and -g.
 --objdump=<path>::
        Path to objdump binary.
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 140a6cd88351..39367609c707 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -226,8 +226,9 @@ static int report__setup_sample_type(struct report *rep)
                        return -EINVAL;
                }
                if (symbol_conf.use_callchain) {
-                        ui__error("Selected -g but no callchain data. Did "
+                        ui__error("Selected -g or --branch-history but no "
-                                    "you call 'perf record' without -g?\n");
+                                  "callchain data. Did\n"
+                                  "you call 'perf record' without -g?\n");
                        return -1;
                }
        } else if (!rep->dont_use_callchains &&
@@ -575,6 +576,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
        struct stat st;
        bool has_br_stack = false;
        int branch_mode = -1;
+        bool branch_call_mode = false;
        char callchain_default_opt[] = "fractal,0.5,callee";
        const char * const report_usage[] = {
                "perf report [<options>]",
@@ -637,8 +639,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                   "regex filter to identify parent, see: '--sort parent'"),
        OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
                    "Only display entries with parent-match"),
-        OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
+        OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order[,branch]",
-                     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
+                     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address), add branches. "
                     "Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt),
        OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
                    "Accumulate callchains of children and show total overhead as well"),
@@ -684,7 +686,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
                    "Show event group information together"),
        OPT_CALLBACK_NOOPT('b', "branch-stack", &branch_mode, "",
-                    "use branch records for histogram filling", parse_branch_mode),
+                    "use branch records for per branch histogram filling",
+                    parse_branch_mode),
+        OPT_BOOLEAN(0, "branch-history", &branch_call_mode,
+                    "add last branch records to call history"),
        OPT_STRING(0, "objdump", &objdump_path, "path",
                   "objdump binary to use for disassembly and annotations"),
        OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
@@ -745,10 +750,24 @@ repeat:
        has_br_stack = perf_header__has_feat(&session->header,
                                             HEADER_BRANCH_STACK);
-        if ((branch_mode == -1 && has_br_stack) || branch_mode == 1) {
+        /*
+         * Branch mode is a tristate:
+         * -1 means default, so decide based on the file having branch data.
+         * 0/1 means the user chose a mode.
+         *
+         * branch_call_mode is a plain bool, so it must be tested for truth:
+         * comparing it with -1 can never match and would make the branch
+         * sort mode unreachable. When --branch-history is given it takes
+         * precedence over the per-branch histogram view.
+         */
+        if (((branch_mode == -1 && has_br_stack) || branch_mode == 1) &&
+            !branch_call_mode) {
+                sort__mode = SORT_MODE__BRANCH;
+                symbol_conf.cumulate_callchain = false;
+        }
+        if (branch_call_mode) {
+                callchain_param.key = CCKEY_ADDRESS;
+                callchain_param.branch_callstack = 1;
+                symbol_conf.use_callchain = true;
+                callchain_register_param(&callchain_param);
+                if (sort_order == NULL)
+                        sort_order = "srcline,symbol,dso";
+        }
        if (report.mem_mode) {
                if (sort__mode == SORT_MODE__BRANCH) {
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 055ce9232c9e..891086376381 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -388,20 +388,102 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
                update_stats(&runtime_itlb_cache_stats[0], count[0]);
 }
+/* Clear the counter's per-package bitmap, if one has been allocated. */
+static void zero_per_pkg(struct perf_evsel *counter)
+{
+        unsigned long *pkg_mask = counter->per_pkg_mask;
+
+        if (pkg_mask == NULL)
+                return;
+
+        memset(pkg_mask, 0, MAX_NR_CPUS);
+}
+/*
+ * Decide whether the reading taken on @cpu must be dropped for a
+ * per-package counter.
+ *
+ * *skip is set to true when the socket that @cpu maps to already has its
+ * bit set in counter->per_pkg_mask; otherwise the bit is set now and *skip
+ * stays false. Counters without per_pkg, or whose cpu map makes
+ * cpu_map__empty() return true, are never skipped. The mask is allocated
+ * on first use.
+ *
+ * Returns 0 on success, -ENOMEM if the mask cannot be allocated, or -1 if
+ * the socket lookup fails.
+ */
+static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
+{
+        struct cpu_map *cpus = perf_evsel__cpus(counter);
+        unsigned long *mask;
+        int pkg;
+
+        *skip = false;
+
+        if (!counter->per_pkg || cpu_map__empty(cpus))
+                return 0;
+
+        mask = counter->per_pkg_mask;
+        if (mask == NULL) {
+                mask = zalloc(MAX_NR_CPUS);
+                if (mask == NULL)
+                        return -ENOMEM;
+                counter->per_pkg_mask = mask;
+        }
+
+        pkg = cpu_map__get_socket(cpus, cpu);
+        if (pkg < 0)
+                return -1;
+
+        *skip = (test_and_set_bit(pkg, mask) == 1);
+        return 0;
+}
+/*
+ * Per-read callback handed to perf_evsel__read_cb(): fold one (cpu, thread)
+ * reading into the counter's state according to aggr_mode.
+ *
+ * A reading that check_per_pkg() marks as skipped is replaced by an
+ * all-zero value. For AGGR_CORE/AGGR_SOCKET/AGGR_NONE the value is turned
+ * into a delta (unless the counter is a snapshot), scaled, stored in
+ * counts->cpu[cpu] and fed to the shadow stats. For AGGR_GLOBAL it is only
+ * summed into counts->aggr; ena/run are accumulated only when scaling.
+ *
+ * Returns 0 on success, -1 if the per-package check fails.
+ */
+static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
+                   struct perf_counts_values *count)
+{
+        struct perf_counts_values *aggr = &evsel->counts->aggr;
+        static struct perf_counts_values zero;
+        bool skip = false;
+
+        if (check_per_pkg(evsel, cpu, &skip)) {
+                pr_err("failed to read per-pkg counter\n");
+                return -1;
+        }
+
+        /*
+         * NOTE(review): the helpers below receive a non-const pointer to
+         * the shared static 'zero' when the reading is skipped; confirm
+         * they never leave it with non-zero contents.
+         */
+        if (skip)
+                count = &zero;
+
+        switch (aggr_mode) {
+        case AGGR_CORE:
+        case AGGR_SOCKET:
+        case AGGR_NONE:
+                if (!evsel->snapshot)
+                        perf_evsel__compute_deltas(evsel, cpu, count);
+                perf_counts_values__scale(count, scale, NULL);
+                evsel->counts->cpu[cpu] = *count;
+                update_shadow_stats(evsel, count->values);
+                break;
+        case AGGR_GLOBAL:
+                aggr->val += count->val;
+                if (scale) {
+                        aggr->ena += count->ena;
+                        aggr->run += count->run;
+                }
+                /* explicit break: do not rely on falling into default */
+                break;
+        default:
+                break;
+        }
+
+        return 0;
+}
+static int read_counter(struct perf_evsel *counter);
 /*
 * Read out the results of a single counter:
 * aggregate counts across CPUs in system-wide mode
 */
 static int read_counter_aggr(struct perf_evsel *counter)
 {
+        struct perf_counts_values *aggr = &counter->counts->aggr;
        struct perf_stat *ps = counter->priv;
        u64 *count = counter->counts->aggr.values;
        int i;
-        if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
+        aggr->val = aggr->ena = aggr->run = 0;
-                               thread_map__nr(evsel_list->threads), scale) < 0)
+        if (read_counter(counter))
                return -1;
+        if (!counter->snapshot)
+                perf_evsel__compute_deltas(counter, -1, aggr);
+        perf_counts_values__scale(aggr, scale, &counter->counts->scaled);
        for (i = 0; i < 3; i++)
                update_stats(&ps->res_stats[i], count[i]);
@@ -424,16 +506,21 @@ static int read_counter_aggr(struct perf_evsel *counter)
 */
 static int read_counter(struct perf_evsel *counter)
 {
-        u64 *count;
+        int nthreads = thread_map__nr(evsel_list->threads);
-        int cpu;
+        int ncpus = perf_evsel__nr_cpus(counter);
+        int cpu, thread;
-        for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+        if (counter->system_wide)
-                if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
+                nthreads = 1;
-                        return -1;
-                count = counter->counts->cpu[cpu].values;
+        if (counter->per_pkg)
+                zero_per_pkg(counter);
-                update_shadow_stats(counter, count);
+        for (thread = 0; thread < nthreads; thread++) {
+                for (cpu = 0; cpu < ncpus; cpu++) {
+                        if (perf_evsel__read_cb(counter, cpu, thread, read_cb))
+                                return -1;
+                }
        }
        return 0;
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 517ed84db97a..cf524a35cc84 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -149,6 +149,10 @@ static int parse_callchain_sort_key(const char *value)
                callchain_param.key = CCKEY_ADDRESS;
                return 0;
        }
+        if (!strncmp(value, "branch", strlen(value))) {
+                callchain_param.branch_callstack = 1;
+                return 0;
+        }
        return -1;
 }
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 3f158474c892..dbc08cf5f970 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -63,6 +63,7 @@ struct callchain_param {
        sort_chain_func_t       sort;
        enum chain_order        order;
        enum chain_key          key;
+        bool                    branch_callstack;
 };
 extern struct callchain_param callchain_param;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2d26b7ad6fe0..1e90c8557ede 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -954,40 +954,6 @@ int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
        return 0;
 }
-int __perf_evsel__read(struct perf_evsel *evsel,
-                       int ncpus, int nthreads, bool scale)
-{
-        size_t nv = scale ? 3 : 1;
-        int cpu, thread;
-        struct perf_counts_values *aggr = &evsel->counts->aggr, count;
-        if (evsel->system_wide)
-                nthreads = 1;
-        aggr->val = aggr->ena = aggr->run = 0;
-        for (cpu = 0; cpu < ncpus; cpu++) {
-                for (thread = 0; thread < nthreads; thread++) {
-                        if (FD(evsel, cpu, thread) < 0)
-                                continue;
-                        if (readn(FD(evsel, cpu, thread),
-                                  &count, nv * sizeof(u64)) < 0)
-                                return -errno;
-                        aggr->val += count.val;
-                        if (scale) {
-                                aggr->ena += count.ena;
-                                aggr->run += count.run;
-                        }
-                }
-        }
-        perf_evsel__compute_deltas(evsel, -1, aggr);
-        perf_counts_values__scale(aggr, scale, &evsel->counts->scaled);
-        return 0;
-}
 static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
 {
        struct perf_evsel *leader = evsel->leader;
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index b18d58da580b..38622747d130 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -93,6 +93,7 @@ struct perf_evsel {
        bool                    system_wide;
        bool                    tracking;
        bool                    per_pkg;
+        unsigned long           *per_pkg_mask;
        /* parse modifier helper */
        int                     exclude_GH;
        int                     nr_members;
@@ -271,35 +272,6 @@ static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
        return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
 }
-int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
-                       bool scale);
-/**
- * perf_evsel__read - Read the aggregate results on all CPUs
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read(struct perf_evsel *evsel,
-                                    int ncpus, int nthreads)
-{
-        return __perf_evsel__read(evsel, ncpus, nthreads, false);
-}
-/**
- * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
-                                          int ncpus, int nthreads)
-{
-        return __perf_evsel__read(evsel, ncpus, nthreads, true);
-}
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                             struct perf_sample *sample);
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b75b487574c7..15dd0a9691ce 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -12,6 +12,7 @@
 #include <stdbool.h>
 #include <symbol/kallsyms.h>
 #include "unwind.h"
+#include "linux/hash.h"
 static void dsos__init(struct dsos *dsos)
 {
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
        al.filtered = 0;
        al.sym = NULL;
-        thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+        if (cpumode == -1)
+                thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
+                                                   ip, &al);
+        else
+                thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
                                   ip, &al);
        if (al.sym != NULL) {
                if (sort__has_parent && !*parent &&
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
        return bi;
 }
+#define CHASHSZ 127
+#define CHASHBITS 7
+#define NO_ENTRY 0xff
+#define PERF_MAX_BRANCH_DEPTH 127
+/*
+ * Remove loops: collapse a run of branch entries that is immediately
+ * repeated.
+ *
+ * @l:  array of branch entries, compacted in place
+ * @nr: number of valid entries in @l
+ *
+ * A small table maps the hash of an entry's 'from' address to the index of
+ * the first entry that landed in that slot (indices are stored as unsigned
+ * char, with NO_ENTRY marking an unused slot). When a later entry i has
+ * the same 'from' as the recorded entry, the 'from' addresses starting at
+ * the recorded index are compared with those starting at i. If they agree
+ * all the way up to i (or up to the end of the array), the repeated
+ * entries at i are dropped with memmove() and nr shrinks by that amount.
+ *
+ * Returns the new number of entries.
+ */
+static int remove_loops(struct branch_entry *l, int nr)
+{
+        int i, j, off;
+        unsigned char chash[CHASHSZ];
+        memset(chash, NO_ENTRY, sizeof(chash));
+        BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
+        for (i = 0; i < nr; i++) {
+                int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
+                /*
+                 * no collision handling for now: when the slot is already
+                 * held by an entry with a different 'from' address,
+                 * neither branch below runs and entry i is left alone.
+                 */
+                if (chash[h] == NO_ENTRY) {
+                        chash[h] = i;
+                } else if (l[chash[h]].from == l[i].from) {
+                        bool is_loop = true;
+                        /*
+                         * check if it is a real loop: walk the earlier run
+                         * [chash[h], i) and the candidate repeat starting
+                         * at i in lockstep; off counts matched entries.
+                         */
+                        off = 0;
+                        for (j = chash[h]; j < i && i + off < nr; j++, off++)
+                                if (l[j].from != l[i + off].from) {
+                                        is_loop = false;
+                                        break;
+                                }
+                        if (is_loop) {
+                                memmove(l + i, l + i + off,
+                                        (nr - (i + off)) * sizeof(*l));
+                                nr -= off;
+                        }
+                }
+        }
+        return nr;
+}
 static int thread__resolve_callchain_sample(struct thread *thread,
                                             struct ip_callchain *chain,
+                                             struct branch_stack *branch,
                                             struct symbol **parent,
                                             struct addr_location *root_al,
                                             int max_stack)
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
        int i;
        int j;
        int err;
-        int skip_idx __maybe_unused;
+        int skip_idx = -1;
+        int first_call = 0;
+        /*
+         * Based on DWARF debug information, some architectures skip
+         * a callchain entry saved by the kernel.
+         */
+        if (chain->nr < PERF_MAX_STACK_DEPTH)
+                skip_idx = arch_skip_callchain_idx(thread, chain);
        callchain_cursor_reset(&callchain_cursor);
+        /*
+         * Add branches to call stack for easier browsing. This gives
+         * more context for a sample than just the callers.
+         *
+         * This uses individual histograms of paths compared to the
+         * aggregated histograms the normal LBR mode uses.
+         *
+         * Limitations for now:
+         * - No extra filters
+         * - No annotations (should annotate somehow)
+         */
+        if (branch && callchain_param.branch_callstack) {
+                /*
+                 * Fixed-size buffer bounded by PERF_MAX_BRANCH_DEPTH.
+                 * branch->nr comes from the sample data, so it is validated
+                 * before anything is sized or indexed from it; a
+                 * variable-length array sized from an unchecked (possibly
+                 * zero or, after the int cast, negative) count is undefined
+                 * behaviour.
+                 */
+                struct branch_entry be[PERF_MAX_BRANCH_DEPTH];
+                int nr;
+
+                if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
+                        pr_warning("corrupted branch chain. skipping...\n");
+                        goto check_calls;
+                }
+
+                /* branch->nr <= PERF_MAX_BRANCH_DEPTH here, so nr fits be[] */
+                nr = min(max_stack, (int)branch->nr);
+
+                for (i = 0; i < nr; i++) {
+                        if (callchain_param.order == ORDER_CALLEE) {
+                                be[i] = branch->entries[i];
+                                /*
+                                 * Check for overlap into the callchain.
+                                 * The return address is one off compared to
+                                 * the branch entry. To adjust for this
+                                 * assume the calling instruction is not longer
+                                 * than 8 bytes.
+                                 */
+                                if (i == skip_idx ||
+                                    chain->ips[first_call] >= PERF_CONTEXT_MAX)
+                                        first_call++;
+                                else if (be[i].from < chain->ips[first_call] &&
+                                    be[i].from >= chain->ips[first_call] - 8)
+                                        first_call++;
+                        } else
+                                be[i] = branch->entries[branch->nr - i - 1];
+                }
+
+                nr = remove_loops(be, nr);
+
+                /* each branch contributes its target, then its source */
+                for (i = 0; i < nr; i++) {
+                        err = add_callchain_ip(thread, parent, root_al,
+                                               -1, be[i].to);
+                        if (!err)
+                                err = add_callchain_ip(thread, parent, root_al,
+                                                       -1, be[i].from);
+                        if (err == -EINVAL)
+                                break;
+                        if (err)
+                                return err;
+                }
+                chain_nr -= nr;
+        }
+check_calls:
        if (chain->nr > PERF_MAX_STACK_DEPTH) {
                pr_warning("corrupted callchain. skipping...\n");
                return 0;
        }
-        /*
+        for (i = first_call; i < chain_nr; i++) {
-         * Based on DWARF debug information, some architectures skip
-         * a callchain entry saved by the kernel.
-         */
-        skip_idx = arch_skip_callchain_idx(thread, chain);
-        for (i = 0; i < chain_nr; i++) {
                u64 ip;
                if (callchain_param.order == ORDER_CALLEE)
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
                              int max_stack)
 {
        int ret = thread__resolve_callchain_sample(thread, sample->callchain,
+                                                   sample->branch_stack,
                                                   parent, root_al, max_stack);
        if (ret)
                return ret;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index e0b297c50f9d..9d602e9c6f59 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -102,7 +102,8 @@ struct symbol_conf {
                        demangle,
                        demangle_kernel,
                        filter_relative,
-                        show_hist_headers;
+                        show_hist_headers,
+                        branch_callstack;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,