aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2016-06-08 03:29:23 -0400
committerIngo Molnar <mingo@kernel.org>2016-06-08 03:29:23 -0400
commitaa3a655b159f11b1afe0dcdac5fb5b172f02b778 (patch)
tree577058fb95c7f2e2aacb3566874e75e17fcec9f8
parent616d1c1b98ac79f30216a57a170dd7cea19b3df3 (diff)
parent7db91f251056f90fec4121f028680ab3153a0f3c (diff)
Merge tag 'perf-core-for-mingo-20160606' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo: User visible changes: - Tooling support for TopDown counters, recently added to the kernel (Andi Kleen) - Show call graphs in 'perf script' when 1st event doesn't have it but some other has (He Kuang) - Fix terminal cleanup when handling invalid .perfconfig files in 'perf top' (Taeung Song) Build fixes: - Respect CROSS_COMPILE for the linker in libapi (Lucas Stach) Infrastructure changes: - Fix perf_evlist__alloc_mmap() failure path (Wang Nan) - Provide way to extract integer value from format_field (Arnaldo Carvalho de Melo) Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--tools/lib/api/Makefile1
-rw-r--r--tools/perf/Documentation/perf-stat.txt32
-rw-r--r--tools/perf/arch/x86/util/Build1
-rw-r--r--tools/perf/arch/x86/util/group.c27
-rw-r--r--tools/perf/builtin-script.c23
-rw-r--r--tools/perf/builtin-stat.c165
-rw-r--r--tools/perf/tests/parse-events.c4
-rw-r--r--tools/perf/util/config.c22
-rw-r--r--tools/perf/util/evlist.c5
-rw-r--r--tools/perf/util/evsel.c25
-rw-r--r--tools/perf/util/evsel.h2
-rw-r--r--tools/perf/util/group.h7
-rw-r--r--tools/perf/util/parse-events.l1
-rw-r--r--tools/perf/util/stat-shadow.c162
-rw-r--r--tools/perf/util/stat.c5
-rw-r--r--tools/perf/util/stat.h5
16 files changed, 441 insertions, 46 deletions
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile
index 316f308a63ea..67ff93ec1515 100644
--- a/tools/lib/api/Makefile
+++ b/tools/lib/api/Makefile
@@ -10,6 +10,7 @@ endif
10 10
11CC = $(CROSS_COMPILE)gcc 11CC = $(CROSS_COMPILE)gcc
12AR = $(CROSS_COMPILE)ar 12AR = $(CROSS_COMPILE)ar
13LD = $(CROSS_COMPILE)ld
13 14
14MAKEFLAGS += --no-print-directory 15MAKEFLAGS += --no-print-directory
15 16
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 04f23b404bbc..d96ccd4844df 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -204,6 +204,38 @@ Aggregate counts per physical processor for system-wide mode measurements.
204--no-aggr:: 204--no-aggr::
205Do not aggregate counts across all monitored CPUs. 205Do not aggregate counts across all monitored CPUs.
206 206
207--topdown::
208Print top down level 1 metrics if supported by the CPU. This allows to
209determine bottle necks in the CPU pipeline for CPU bound workloads,
210by breaking the cycles consumed down into frontend bound, backend bound,
211bad speculation and retiring.
212
213Frontend bound means that the CPU cannot fetch and decode instructions fast
214enough. Backend bound means that computation or memory access is the bottle
215neck. Bad Speculation means that the CPU wasted cycles due to branch
216mispredictions and similar issues. Retiring means that the CPU computed without
217an apparently bottleneck. The bottleneck is only the real bottleneck
218if the workload is actually bound by the CPU and not by something else.
219
220For best results it is usually a good idea to use it with interval
221mode like -I 1000, as the bottleneck of workloads can change often.
222
223The top down metrics are collected per core instead of per
224CPU thread. Per core mode is automatically enabled
225and -a (global monitoring) is needed, requiring root rights or
226perf.perf_event_paranoid=-1.
227
228Topdown uses the full Performance Monitoring Unit, and needs
229disabling of the NMI watchdog (as root):
230echo 0 > /proc/sys/kernel/nmi_watchdog
231for best results. Otherwise the bottlenecks may be inconsistent
232on workload with changing phases.
233
234This enables --metric-only, unless overriden with --no-metric-only.
235
236To interpret the results it is usually needed to know on which
237CPUs the workload runs on. If needed the CPUs can be forced using
238taskset.
207 239
208EXAMPLES 240EXAMPLES
209-------- 241--------
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 465970370f3e..4cd8a16b1b7b 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -3,6 +3,7 @@ libperf-y += tsc.o
3libperf-y += pmu.o 3libperf-y += pmu.o
4libperf-y += kvm-stat.o 4libperf-y += kvm-stat.o
5libperf-y += perf_regs.o 5libperf-y += perf_regs.o
6libperf-y += group.o
6 7
7libperf-$(CONFIG_DWARF) += dwarf-regs.o 8libperf-$(CONFIG_DWARF) += dwarf-regs.o
8libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o 9libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/util/group.c b/tools/perf/arch/x86/util/group.c
new file mode 100644
index 000000000000..37f92aa39a5d
--- /dev/null
+++ b/tools/perf/arch/x86/util/group.c
@@ -0,0 +1,27 @@
1#include <stdio.h>
2#include "api/fs/fs.h"
3#include "util/group.h"
4
5/*
6 * Check whether we can use a group for top down.
7 * Without a group may get bad results due to multiplexing.
8 */
9bool arch_topdown_check_group(bool *warn)
10{
11 int n;
12
13 if (sysctl__read_int("kernel/nmi_watchdog", &n) < 0)
14 return false;
15 if (n > 0) {
16 *warn = true;
17 return false;
18 }
19 return true;
20}
21
22void arch_topdown_group_warn(void)
23{
24 fprintf(stderr,
25 "nmi_watchdog enabled with topdown. May give wrong results.\n"
26 "Disable with echo 0 > /proc/sys/kernel/nmi_watchdog\n");
27}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index e3ce2f34d3ad..46011235af5d 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -339,7 +339,7 @@ static void set_print_ip_opts(struct perf_event_attr *attr)
339 */ 339 */
340static int perf_session__check_output_opt(struct perf_session *session) 340static int perf_session__check_output_opt(struct perf_session *session)
341{ 341{
342 int j; 342 unsigned int j;
343 struct perf_evsel *evsel; 343 struct perf_evsel *evsel;
344 344
345 for (j = 0; j < PERF_TYPE_MAX; ++j) { 345 for (j = 0; j < PERF_TYPE_MAX; ++j) {
@@ -388,17 +388,20 @@ static int perf_session__check_output_opt(struct perf_session *session)
388 struct perf_event_attr *attr; 388 struct perf_event_attr *attr;
389 389
390 j = PERF_TYPE_TRACEPOINT; 390 j = PERF_TYPE_TRACEPOINT;
391 evsel = perf_session__find_first_evtype(session, j);
392 if (evsel == NULL)
393 goto out;
394 391
395 attr = &evsel->attr; 392 evlist__for_each(session->evlist, evsel) {
393 if (evsel->attr.type != j)
394 continue;
395
396 attr = &evsel->attr;
396 397
397 if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) { 398 if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
398 output[j].fields |= PERF_OUTPUT_IP; 399 output[j].fields |= PERF_OUTPUT_IP;
399 output[j].fields |= PERF_OUTPUT_SYM; 400 output[j].fields |= PERF_OUTPUT_SYM;
400 output[j].fields |= PERF_OUTPUT_DSO; 401 output[j].fields |= PERF_OUTPUT_DSO;
401 set_print_ip_opts(attr); 402 set_print_ip_opts(attr);
403 goto out;
404 }
402 } 405 }
403 } 406 }
404 407
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ee7ada78d86f..dff63733dfb7 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,10 +59,13 @@
59#include "util/thread.h" 59#include "util/thread.h"
60#include "util/thread_map.h" 60#include "util/thread_map.h"
61#include "util/counts.h" 61#include "util/counts.h"
62#include "util/group.h"
62#include "util/session.h" 63#include "util/session.h"
63#include "util/tool.h" 64#include "util/tool.h"
65#include "util/group.h"
64#include "asm/bug.h" 66#include "asm/bug.h"
65 67
68#include <api/fs/fs.h>
66#include <stdlib.h> 69#include <stdlib.h>
67#include <sys/prctl.h> 70#include <sys/prctl.h>
68#include <locale.h> 71#include <locale.h>
@@ -98,6 +101,15 @@ static const char * transaction_limited_attrs = {
98 "}" 101 "}"
99}; 102};
100 103
104static const char * topdown_attrs[] = {
105 "topdown-total-slots",
106 "topdown-slots-retired",
107 "topdown-recovery-bubbles",
108 "topdown-fetch-bubbles",
109 "topdown-slots-issued",
110 NULL,
111};
112
101static struct perf_evlist *evsel_list; 113static struct perf_evlist *evsel_list;
102 114
103static struct target target = { 115static struct target target = {
@@ -112,6 +124,7 @@ static volatile pid_t child_pid = -1;
112static bool null_run = false; 124static bool null_run = false;
113static int detailed_run = 0; 125static int detailed_run = 0;
114static bool transaction_run; 126static bool transaction_run;
127static bool topdown_run = false;
115static bool big_num = true; 128static bool big_num = true;
116static int big_num_opt = -1; 129static int big_num_opt = -1;
117static const char *csv_sep = NULL; 130static const char *csv_sep = NULL;
@@ -124,6 +137,7 @@ static unsigned int initial_delay = 0;
124static unsigned int unit_width = 4; /* strlen("unit") */ 137static unsigned int unit_width = 4; /* strlen("unit") */
125static bool forever = false; 138static bool forever = false;
126static bool metric_only = false; 139static bool metric_only = false;
140static bool force_metric_only = false;
127static struct timespec ref_time; 141static struct timespec ref_time;
128static struct cpu_map *aggr_map; 142static struct cpu_map *aggr_map;
129static aggr_get_id_t aggr_get_id; 143static aggr_get_id_t aggr_get_id;
@@ -1302,7 +1316,15 @@ static int aggr_header_lens[] = {
1302 [AGGR_GLOBAL] = 0, 1316 [AGGR_GLOBAL] = 0,
1303}; 1317};
1304 1318
1305static void print_metric_headers(char *prefix) 1319static const char *aggr_header_csv[] = {
1320 [AGGR_CORE] = "core,cpus,",
1321 [AGGR_SOCKET] = "socket,cpus",
1322 [AGGR_NONE] = "cpu,",
1323 [AGGR_THREAD] = "comm-pid,",
1324 [AGGR_GLOBAL] = ""
1325};
1326
1327static void print_metric_headers(const char *prefix, bool no_indent)
1306{ 1328{
1307 struct perf_stat_output_ctx out; 1329 struct perf_stat_output_ctx out;
1308 struct perf_evsel *counter; 1330 struct perf_evsel *counter;
@@ -1313,9 +1335,15 @@ static void print_metric_headers(char *prefix)
1313 if (prefix) 1335 if (prefix)
1314 fprintf(stat_config.output, "%s", prefix); 1336 fprintf(stat_config.output, "%s", prefix);
1315 1337
1316 if (!csv_output) 1338 if (!csv_output && !no_indent)
1317 fprintf(stat_config.output, "%*s", 1339 fprintf(stat_config.output, "%*s",
1318 aggr_header_lens[stat_config.aggr_mode], ""); 1340 aggr_header_lens[stat_config.aggr_mode], "");
1341 if (csv_output) {
1342 if (stat_config.interval)
1343 fputs("time,", stat_config.output);
1344 fputs(aggr_header_csv[stat_config.aggr_mode],
1345 stat_config.output);
1346 }
1319 1347
1320 /* Print metrics headers only */ 1348 /* Print metrics headers only */
1321 evlist__for_each(evsel_list, counter) { 1349 evlist__for_each(evsel_list, counter) {
@@ -1338,28 +1366,40 @@ static void print_interval(char *prefix, struct timespec *ts)
1338 1366
1339 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); 1367 sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
1340 1368
1341 if (num_print_interval == 0 && !csv_output && !metric_only) { 1369 if (num_print_interval == 0 && !csv_output) {
1342 switch (stat_config.aggr_mode) { 1370 switch (stat_config.aggr_mode) {
1343 case AGGR_SOCKET: 1371 case AGGR_SOCKET:
1344 fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); 1372 fprintf(output, "# time socket cpus");
1373 if (!metric_only)
1374 fprintf(output, " counts %*s events\n", unit_width, "unit");
1345 break; 1375 break;
1346 case AGGR_CORE: 1376 case AGGR_CORE:
1347 fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit"); 1377 fprintf(output, "# time core cpus");
1378 if (!metric_only)
1379 fprintf(output, " counts %*s events\n", unit_width, "unit");
1348 break; 1380 break;
1349 case AGGR_NONE: 1381 case AGGR_NONE:
1350 fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); 1382 fprintf(output, "# time CPU");
1383 if (!metric_only)
1384 fprintf(output, " counts %*s events\n", unit_width, "unit");
1351 break; 1385 break;
1352 case AGGR_THREAD: 1386 case AGGR_THREAD:
1353 fprintf(output, "# time comm-pid counts %*s events\n", unit_width, "unit"); 1387 fprintf(output, "# time comm-pid");
1388 if (!metric_only)
1389 fprintf(output, " counts %*s events\n", unit_width, "unit");
1354 break; 1390 break;
1355 case AGGR_GLOBAL: 1391 case AGGR_GLOBAL:
1356 default: 1392 default:
1357 fprintf(output, "# time counts %*s events\n", unit_width, "unit"); 1393 fprintf(output, "# time");
1394 if (!metric_only)
1395 fprintf(output, " counts %*s events\n", unit_width, "unit");
1358 case AGGR_UNSET: 1396 case AGGR_UNSET:
1359 break; 1397 break;
1360 } 1398 }
1361 } 1399 }
1362 1400
1401 if (num_print_interval == 0 && metric_only)
1402 print_metric_headers(" ", true);
1363 if (++num_print_interval == 25) 1403 if (++num_print_interval == 25)
1364 num_print_interval = 0; 1404 num_print_interval = 0;
1365} 1405}
@@ -1428,8 +1468,8 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
1428 if (metric_only) { 1468 if (metric_only) {
1429 static int num_print_iv; 1469 static int num_print_iv;
1430 1470
1431 if (num_print_iv == 0) 1471 if (num_print_iv == 0 && !interval)
1432 print_metric_headers(prefix); 1472 print_metric_headers(prefix, false);
1433 if (num_print_iv++ == 25) 1473 if (num_print_iv++ == 25)
1434 num_print_iv = 0; 1474 num_print_iv = 0;
1435 if (stat_config.aggr_mode == AGGR_GLOBAL && prefix) 1475 if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
@@ -1520,6 +1560,14 @@ static int stat__set_big_num(const struct option *opt __maybe_unused,
1520 return 0; 1560 return 0;
1521} 1561}
1522 1562
1563static int enable_metric_only(const struct option *opt __maybe_unused,
1564 const char *s __maybe_unused, int unset)
1565{
1566 force_metric_only = true;
1567 metric_only = !unset;
1568 return 0;
1569}
1570
1523static const struct option stat_options[] = { 1571static const struct option stat_options[] = {
1524 OPT_BOOLEAN('T', "transaction", &transaction_run, 1572 OPT_BOOLEAN('T', "transaction", &transaction_run,
1525 "hardware transaction statistics"), 1573 "hardware transaction statistics"),
@@ -1578,8 +1626,10 @@ static const struct option stat_options[] = {
1578 "aggregate counts per thread", AGGR_THREAD), 1626 "aggregate counts per thread", AGGR_THREAD),
1579 OPT_UINTEGER('D', "delay", &initial_delay, 1627 OPT_UINTEGER('D', "delay", &initial_delay,
1580 "ms to wait before starting measurement after program start"), 1628 "ms to wait before starting measurement after program start"),
1581 OPT_BOOLEAN(0, "metric-only", &metric_only, 1629 OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL,
1582 "Only print computed metrics. No raw values"), 1630 "Only print computed metrics. No raw values", enable_metric_only),
1631 OPT_BOOLEAN(0, "topdown", &topdown_run,
1632 "measure topdown level 1 statistics"),
1583 OPT_END() 1633 OPT_END()
1584}; 1634};
1585 1635
@@ -1772,12 +1822,62 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
1772 return 0; 1822 return 0;
1773} 1823}
1774 1824
1825static int topdown_filter_events(const char **attr, char **str, bool use_group)
1826{
1827 int off = 0;
1828 int i;
1829 int len = 0;
1830 char *s;
1831
1832 for (i = 0; attr[i]; i++) {
1833 if (pmu_have_event("cpu", attr[i])) {
1834 len += strlen(attr[i]) + 1;
1835 attr[i - off] = attr[i];
1836 } else
1837 off++;
1838 }
1839 attr[i - off] = NULL;
1840
1841 *str = malloc(len + 1 + 2);
1842 if (!*str)
1843 return -1;
1844 s = *str;
1845 if (i - off == 0) {
1846 *s = 0;
1847 return 0;
1848 }
1849 if (use_group)
1850 *s++ = '{';
1851 for (i = 0; attr[i]; i++) {
1852 strcpy(s, attr[i]);
1853 s += strlen(s);
1854 *s++ = ',';
1855 }
1856 if (use_group) {
1857 s[-1] = '}';
1858 *s = 0;
1859 } else
1860 s[-1] = 0;
1861 return 0;
1862}
1863
1864__weak bool arch_topdown_check_group(bool *warn)
1865{
1866 *warn = false;
1867 return false;
1868}
1869
1870__weak void arch_topdown_group_warn(void)
1871{
1872}
1873
1775/* 1874/*
1776 * Add default attributes, if there were no attributes specified or 1875 * Add default attributes, if there were no attributes specified or
1777 * if -d/--detailed, -d -d or -d -d -d is used: 1876 * if -d/--detailed, -d -d or -d -d -d is used:
1778 */ 1877 */
1779static int add_default_attributes(void) 1878static int add_default_attributes(void)
1780{ 1879{
1880 int err;
1781 struct perf_event_attr default_attrs0[] = { 1881 struct perf_event_attr default_attrs0[] = {
1782 1882
1783 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 1883 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
@@ -1896,7 +1996,6 @@ static int add_default_attributes(void)
1896 return 0; 1996 return 0;
1897 1997
1898 if (transaction_run) { 1998 if (transaction_run) {
1899 int err;
1900 if (pmu_have_event("cpu", "cycles-ct") && 1999 if (pmu_have_event("cpu", "cycles-ct") &&
1901 pmu_have_event("cpu", "el-start")) 2000 pmu_have_event("cpu", "el-start"))
1902 err = parse_events(evsel_list, transaction_attrs, NULL); 2001 err = parse_events(evsel_list, transaction_attrs, NULL);
@@ -1909,6 +2008,46 @@ static int add_default_attributes(void)
1909 return 0; 2008 return 0;
1910 } 2009 }
1911 2010
2011 if (topdown_run) {
2012 char *str = NULL;
2013 bool warn = false;
2014
2015 if (stat_config.aggr_mode != AGGR_GLOBAL &&
2016 stat_config.aggr_mode != AGGR_CORE) {
2017 pr_err("top down event configuration requires --per-core mode\n");
2018 return -1;
2019 }
2020 stat_config.aggr_mode = AGGR_CORE;
2021 if (nr_cgroups || !target__has_cpu(&target)) {
2022 pr_err("top down event configuration requires system-wide mode (-a)\n");
2023 return -1;
2024 }
2025
2026 if (!force_metric_only)
2027 metric_only = true;
2028 if (topdown_filter_events(topdown_attrs, &str,
2029 arch_topdown_check_group(&warn)) < 0) {
2030 pr_err("Out of memory\n");
2031 return -1;
2032 }
2033 if (topdown_attrs[0] && str) {
2034 if (warn)
2035 arch_topdown_group_warn();
2036 err = parse_events(evsel_list, str, NULL);
2037 if (err) {
2038 fprintf(stderr,
2039 "Cannot set up top down events %s: %d\n",
2040 str, err);
2041 free(str);
2042 return -1;
2043 }
2044 } else {
2045 fprintf(stderr, "System does not support topdown\n");
2046 return -1;
2047 }
2048 free(str);
2049 }
2050
1912 if (!evsel_list->nr_entries) { 2051 if (!evsel_list->nr_entries) {
1913 if (target__has_cpu(&target)) 2052 if (target__has_cpu(&target))
1914 default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK; 2053 default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 7865f68dc0d8..b2a2c74136a5 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1783,8 +1783,8 @@ static int test_pmu_events(void)
1783 struct evlist_test e; 1783 struct evlist_test e;
1784 char name[MAX_NAME]; 1784 char name[MAX_NAME];
1785 1785
1786 if (!strcmp(ent->d_name, ".") || 1786 /* Names containing . are special and cannot be used directly */
1787 !strcmp(ent->d_name, "..")) 1787 if (strchr(ent->d_name, '.'))
1788 continue; 1788 continue;
1789 1789
1790 snprintf(name, MAX_NAME, "cpu/event=%s/u", ent->d_name); 1790 snprintf(name, MAX_NAME, "cpu/event=%s/u", ent->d_name);
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index dad7d8272168..c73f1c4d1ca9 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -275,7 +275,8 @@ static int perf_parse_file(config_fn_t fn, void *data)
275 break; 275 break;
276 } 276 }
277 } 277 }
278 die("bad config file line %d in %s", config_linenr, config_file_name); 278 pr_err("bad config file line %d in %s\n", config_linenr, config_file_name);
279 return -1;
279} 280}
280 281
281static int parse_unit_factor(const char *end, unsigned long *val) 282static int parse_unit_factor(const char *end, unsigned long *val)
@@ -479,16 +480,15 @@ static int perf_config_global(void)
479 480
480int perf_config(config_fn_t fn, void *data) 481int perf_config(config_fn_t fn, void *data)
481{ 482{
482 int ret = 0, found = 0; 483 int ret = -1;
483 const char *home = NULL; 484 const char *home = NULL;
484 485
485 /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */ 486 /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
486 if (config_exclusive_filename) 487 if (config_exclusive_filename)
487 return perf_config_from_file(fn, config_exclusive_filename, data); 488 return perf_config_from_file(fn, config_exclusive_filename, data);
488 if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) { 489 if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
489 ret += perf_config_from_file(fn, perf_etc_perfconfig(), 490 if (perf_config_from_file(fn, perf_etc_perfconfig(), data) < 0)
490 data); 491 goto out;
491 found += 1;
492 } 492 }
493 493
494 home = getenv("HOME"); 494 home = getenv("HOME");
@@ -514,14 +514,12 @@ int perf_config(config_fn_t fn, void *data)
514 if (!st.st_size) 514 if (!st.st_size)
515 goto out_free; 515 goto out_free;
516 516
517 ret += perf_config_from_file(fn, user_config, data); 517 ret = perf_config_from_file(fn, user_config, data);
518 found += 1; 518
519out_free: 519out_free:
520 free(user_config); 520 free(user_config);
521 } 521 }
522out: 522out:
523 if (found == 0)
524 return -1;
525 return ret; 523 return ret;
526} 524}
527 525
@@ -609,8 +607,12 @@ static int collect_config(const char *var, const char *value,
609 struct perf_config_section *section = NULL; 607 struct perf_config_section *section = NULL;
610 struct perf_config_item *item = NULL; 608 struct perf_config_item *item = NULL;
611 struct perf_config_set *set = perf_config_set; 609 struct perf_config_set *set = perf_config_set;
612 struct list_head *sections = &set->sections; 610 struct list_head *sections;
611
612 if (set == NULL)
613 return -1;
613 614
615 sections = &set->sections;
614 key = ptr = strdup(var); 616 key = ptr = strdup(var);
615 if (!key) { 617 if (!key) {
616 pr_debug("%s: strdup failed\n", __func__); 618 pr_debug("%s: strdup failed\n", __func__);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e0f30946ed1a..1b918aa075d6 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -946,9 +946,12 @@ static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
946 if (cpu_map__empty(evlist->cpus)) 946 if (cpu_map__empty(evlist->cpus))
947 evlist->nr_mmaps = thread_map__nr(evlist->threads); 947 evlist->nr_mmaps = thread_map__nr(evlist->threads);
948 evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap)); 948 evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
949 if (!evlist->mmap)
950 return -ENOMEM;
951
949 for (i = 0; i < evlist->nr_mmaps; i++) 952 for (i = 0; i < evlist->nr_mmaps; i++)
950 evlist->mmap[i].fd = -1; 953 evlist->mmap[i].fd = -1;
951 return evlist->mmap != NULL ? 0 : -ENOMEM; 954 return 0;
952} 955}
953 956
954struct mmap_params { 957struct mmap_params {
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 18e18f1d435e..9b2e3e624efe 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2251,17 +2251,11 @@ void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample *sample,
2251 return sample->raw_data + offset; 2251 return sample->raw_data + offset;
2252} 2252}
2253 2253
2254u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, 2254u64 format_field__intval(struct format_field *field, struct perf_sample *sample,
2255 const char *name) 2255 bool needs_swap)
2256{ 2256{
2257 struct format_field *field = perf_evsel__field(evsel, name);
2258 void *ptr;
2259 u64 value; 2257 u64 value;
2260 2258 void *ptr = sample->raw_data + field->offset;
2261 if (!field)
2262 return 0;
2263
2264 ptr = sample->raw_data + field->offset;
2265 2259
2266 switch (field->size) { 2260 switch (field->size) {
2267 case 1: 2261 case 1:
@@ -2279,7 +2273,7 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
2279 return 0; 2273 return 0;
2280 } 2274 }
2281 2275
2282 if (!evsel->needs_swap) 2276 if (!needs_swap)
2283 return value; 2277 return value;
2284 2278
2285 switch (field->size) { 2279 switch (field->size) {
@@ -2296,6 +2290,17 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
2296 return 0; 2290 return 0;
2297} 2291}
2298 2292
2293u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
2294 const char *name)
2295{
2296 struct format_field *field = perf_evsel__field(evsel, name);
2297
2298 if (!field)
2299 return 0;
2300
2301 return field ? format_field__intval(field, sample, evsel->needs_swap) : 0;
2302}
2303
2299bool perf_evsel__fallback(struct perf_evsel *evsel, int err, 2304bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
2300 char *msg, size_t msgsize) 2305 char *msg, size_t msgsize)
2301{ 2306{
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 028412b32d5a..828ddd1c8947 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -261,6 +261,8 @@ static inline char *perf_evsel__strval(struct perf_evsel *evsel,
261 261
262struct format_field; 262struct format_field;
263 263
264u64 format_field__intval(struct format_field *field, struct perf_sample *sample, bool needs_swap);
265
264struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name); 266struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name);
265 267
266#define perf_evsel__match(evsel, t, c) \ 268#define perf_evsel__match(evsel, t, c) \
diff --git a/tools/perf/util/group.h b/tools/perf/util/group.h
new file mode 100644
index 000000000000..116debe7a995
--- /dev/null
+++ b/tools/perf/util/group.h
@@ -0,0 +1,7 @@
1#ifndef GROUP_H
2#define GROUP_H 1
3
4bool arch_topdown_check_group(bool *warn);
5void arch_topdown_group_warn(void);
6
7#endif
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 01af1ee90a27..3c15b33b2e84 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -260,6 +260,7 @@ cycles-ct { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
260cycles-t { return str(yyscanner, PE_KERNEL_PMU_EVENT); } 260cycles-t { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
261mem-loads { return str(yyscanner, PE_KERNEL_PMU_EVENT); } 261mem-loads { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
262mem-stores { return str(yyscanner, PE_KERNEL_PMU_EVENT); } 262mem-stores { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
263topdown-[a-z-]+ { return str(yyscanner, PE_KERNEL_PMU_EVENT); }
263 264
264L1-dcache|l1-d|l1d|L1-data | 265L1-dcache|l1-d|l1d|L1-data |
265L1-icache|l1-i|l1i|L1-instruction | 266L1-icache|l1-i|l1i|L1-instruction |
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index aa9efe08762b..8a2bbd2a4d82 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -36,6 +36,11 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
36static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS]; 36static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
37static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS]; 37static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
38static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS]; 38static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
39static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
40static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
41static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
42static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
43static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
39static bool have_frontend_stalled; 44static bool have_frontend_stalled;
40 45
41struct stats walltime_nsecs_stats; 46struct stats walltime_nsecs_stats;
@@ -82,6 +87,11 @@ void perf_stat__reset_shadow_stats(void)
82 sizeof(runtime_transaction_stats)); 87 sizeof(runtime_transaction_stats));
83 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats)); 88 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
84 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 89 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
90 memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
91 memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
92 memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
93 memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
94 memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
85} 95}
86 96
87/* 97/*
@@ -105,6 +115,16 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
105 update_stats(&runtime_transaction_stats[ctx][cpu], count[0]); 115 update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
106 else if (perf_stat_evsel__is(counter, ELISION_START)) 116 else if (perf_stat_evsel__is(counter, ELISION_START))
107 update_stats(&runtime_elision_stats[ctx][cpu], count[0]); 117 update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
118 else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
119 update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
120 else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
121 update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
122 else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
123 update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
124 else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
125 update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu],count[0]);
126 else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
127 update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
108 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 128 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
109 update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]); 129 update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
110 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 130 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -302,6 +322,107 @@ static void print_ll_cache_misses(int cpu,
302 out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio); 322 out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
303} 323}
304 324
325/*
326 * High level "TopDown" CPU core pipeline bottleneck breakdown.
327 *
328 * Basic concept following:
329 * Ahmad Yasin, "A Top-Down Method for Performance Analysis and
330 * Counters Architecture", ISPASS 2014.
331 *
332 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
333 *
334 * Frontend -> Backend -> Retiring
335 * BadSpeculation in addition means out of order execution that is thrown away
336 * (for example branch mispredictions)
337 * Frontend is instruction decoding.
338 * Backend is execution, like computation and accessing data in memory
339 * Retiring is good execution that is not directly bottlenecked
340 *
341 * The formulas are computed in slots.
342 * A slot is an entry in the pipeline, one for each unit of pipeline width
343 * (for example a 4-wide pipeline has 4 slots for each cycle)
344 *
345 * Formulas:
346 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
347 * TotalSlots
348 * Retiring = SlotsRetired / TotalSlots
349 * FrontendBound = FetchBubbles / TotalSlots
350 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
351 *
352 * The kernel provides the mapping to the low level CPU events and any scaling
353 * needed for the CPU pipeline width, for example:
354 *
355 * TotalSlots = Cycles * 4
356 *
357 * The scaling factor is communicated in the sysfs unit.
358 *
359 * In some cases the CPU may not be able to measure all the formulas due to
360 * missing events. In this case multiple formulas are combined, as possible.
361 *
362 * Full TopDown supports more levels to sub-divide each area: for example
363 * BackendBound into computing bound and memory bound. For now we only
364 * support Level 1 TopDown.
365 */
366
/*
 * Clamp small negative rounding noise (down to -2%) to zero so derived
 * TopDown ratios never display as slightly negative percentages.
 * Larger negative values are passed through unchanged.
 */
static double sanitize_val(double x)
{
	return (x < 0 && x >= -0.02) ? 0.0 : x;
}
373
374static double td_total_slots(int ctx, int cpu)
375{
376 return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
377}
378
379static double td_bad_spec(int ctx, int cpu)
380{
381 double bad_spec = 0;
382 double total_slots;
383 double total;
384
385 total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
386 avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
387 avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
388 total_slots = td_total_slots(ctx, cpu);
389 if (total_slots)
390 bad_spec = total / total_slots;
391 return sanitize_val(bad_spec);
392}
393
394static double td_retiring(int ctx, int cpu)
395{
396 double retiring = 0;
397 double total_slots = td_total_slots(ctx, cpu);
398 double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
399
400 if (total_slots)
401 retiring = ret_slots / total_slots;
402 return retiring;
403}
404
405static double td_fe_bound(int ctx, int cpu)
406{
407 double fe_bound = 0;
408 double total_slots = td_total_slots(ctx, cpu);
409 double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
410
411 if (total_slots)
412 fe_bound = fetch_bub / total_slots;
413 return fe_bound;
414}
415
/*
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound.
 * When every other component is zero nothing was measured, so report 0
 * instead of a bogus 100% backend bound; tiny negative rounding noise
 * is clamped by sanitize_val().
 */
static double td_be_bound(int ctx, int cpu)
{
	double other = td_fe_bound(ctx, cpu) + td_bad_spec(ctx, cpu) +
		       td_retiring(ctx, cpu);

	return other == 0 ? 0 : sanitize_val(1.0 - other);
}
425
305void perf_stat__print_shadow_stats(struct perf_evsel *evsel, 426void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
306 double avg, int cpu, 427 double avg, int cpu,
307 struct perf_stat_output_ctx *out) 428 struct perf_stat_output_ctx *out)
@@ -309,6 +430,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
309 void *ctxp = out->ctx; 430 void *ctxp = out->ctx;
310 print_metric_t print_metric = out->print_metric; 431 print_metric_t print_metric = out->print_metric;
311 double total, ratio = 0.0, total2; 432 double total, ratio = 0.0, total2;
433 const char *color = NULL;
312 int ctx = evsel_context(evsel); 434 int ctx = evsel_context(evsel);
313 435
314 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 436 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
@@ -452,6 +574,46 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
452 avg / ratio); 574 avg / ratio);
453 else 575 else
454 print_metric(ctxp, NULL, NULL, "CPUs utilized", 0); 576 print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
577 } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
578 double fe_bound = td_fe_bound(ctx, cpu);
579
580 if (fe_bound > 0.2)
581 color = PERF_COLOR_RED;
582 print_metric(ctxp, color, "%8.1f%%", "frontend bound",
583 fe_bound * 100.);
584 } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
585 double retiring = td_retiring(ctx, cpu);
586
587 if (retiring > 0.7)
588 color = PERF_COLOR_GREEN;
589 print_metric(ctxp, color, "%8.1f%%", "retiring",
590 retiring * 100.);
591 } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
592 double bad_spec = td_bad_spec(ctx, cpu);
593
594 if (bad_spec > 0.1)
595 color = PERF_COLOR_RED;
596 print_metric(ctxp, color, "%8.1f%%", "bad speculation",
597 bad_spec * 100.);
598 } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
599 double be_bound = td_be_bound(ctx, cpu);
600 const char *name = "backend bound";
601 static int have_recovery_bubbles = -1;
602
603 /* In case the CPU does not support topdown-recovery-bubbles */
604 if (have_recovery_bubbles < 0)
605 have_recovery_bubbles = pmu_have_event("cpu",
606 "topdown-recovery-bubbles");
607 if (!have_recovery_bubbles)
608 name = "backend bound/bad spec";
609
610 if (be_bound > 0.2)
611 color = PERF_COLOR_RED;
612 if (td_total_slots(ctx, cpu) > 0)
613 print_metric(ctxp, color, "%8.1f%%", name,
614 be_bound * 100.);
615 else
616 print_metric(ctxp, NULL, NULL, name, 0);
455 } else if (runtime_nsecs_stats[cpu].n != 0) { 617 } else if (runtime_nsecs_stats[cpu].n != 0) {
456 char unit = 'M'; 618 char unit = 'M';
457 char unit_buf[10]; 619 char unit_buf[10];
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index ffa1d0653861..c1ba255f2abe 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -79,6 +79,11 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = {
79 ID(TRANSACTION_START, cpu/tx-start/), 79 ID(TRANSACTION_START, cpu/tx-start/),
80 ID(ELISION_START, cpu/el-start/), 80 ID(ELISION_START, cpu/el-start/),
81 ID(CYCLES_IN_TX_CP, cpu/cycles-ct/), 81 ID(CYCLES_IN_TX_CP, cpu/cycles-ct/),
82 ID(TOPDOWN_TOTAL_SLOTS, topdown-total-slots),
83 ID(TOPDOWN_SLOTS_ISSUED, topdown-slots-issued),
84 ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
85 ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
86 ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
82}; 87};
83#undef ID 88#undef ID
84 89
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0150e786ccc7..c29bb94c48a4 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -17,6 +17,11 @@ enum perf_stat_evsel_id {
17 PERF_STAT_EVSEL_ID__TRANSACTION_START, 17 PERF_STAT_EVSEL_ID__TRANSACTION_START,
18 PERF_STAT_EVSEL_ID__ELISION_START, 18 PERF_STAT_EVSEL_ID__ELISION_START,
19 PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP, 19 PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP,
20 PERF_STAT_EVSEL_ID__TOPDOWN_TOTAL_SLOTS,
21 PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_ISSUED,
22 PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
23 PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
24 PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
20 PERF_STAT_EVSEL_ID__MAX, 25 PERF_STAT_EVSEL_ID__MAX,
21}; 26};
22 27