diff options
author | Ingo Molnar <mingo@elte.hu> | 2011-05-19 07:30:56 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-05-19 08:29:51 -0400 |
commit | 2cba3ffb9a9db3874304a1739002d053d53c738b (patch) | |
tree | a81c614ea1be6db95c2d0d1a1e57287a631e9c6b | |
parent | b313207286a78abac19f1dd2721292eae598b0f5 (diff) |
perf stat: Add -d -d and -d -d -d options to show more CPU events
Print even more detailed statistics if requested via perf stat -d:
-d: detailed events, L1 and LLC data cache
-d -d: more detailed events, dTLB and iTLB events
-d -d -d: very detailed events, adding prefetch events
Full output looks like this now:
Performance counter stats for '/home/mingo/hackbench 10' (5 runs):
1703.674707 task-clock # 8.709 CPUs utilized ( +- 4.19% )
49,068 context-switches # 0.029 M/sec ( +- 16.66% )
8,303 CPU-migrations # 0.005 M/sec ( +- 24.90% )
17,397 page-faults # 0.010 M/sec ( +- 0.46% )
2,345,389,239 cycles # 1.377 GHz ( +- 4.61% ) [55.90%]
1,884,503,527 stalled-cycles-frontend # 80.35% frontend cycles idle ( +- 5.67% ) [50.39%]
743,919,737 stalled-cycles-backend # 31.72% backend cycles idle ( +- 8.75% ) [49.91%]
1,314,416,379 instructions # 0.56 insns per cycle
# 1.43 stalled cycles per insn ( +- 2.53% ) [60.87%]
272,592,567 branches # 160.003 M/sec ( +- 1.74% ) [56.56%]
3,794,846 branch-misses # 1.39% of all branches ( +- 6.59% ) [58.50%]
449,982,778 L1-dcache-loads # 264.125 M/sec ( +- 2.47% ) [49.88%]
22,404,961 L1-dcache-load-misses # 4.98% of all L1-dcache hits ( +- 6.08% ) [55.05%]
6,204,750 LLC-loads # 3.642 M/sec ( +- 8.91% ) [43.75%]
1,837,411 LLC-load-misses # 1.078 M/sec ( +- 7.27% ) [12.07%]
411,440,421 L1-icache-loads # 241.502 M/sec ( +- 5.60% ) [36.52%]
27,556,832 L1-icache-load-misses # 16.175 M/sec ( +- 7.46% ) [46.72%]
464,067,627 dTLB-loads # 272.392 M/sec ( +- 4.46% ) [54.17%]
10,765,648 dTLB-load-misses # 6.319 M/sec ( +- 3.18% ) [48.68%]
1,273,080,386 iTLB-loads # 747.256 M/sec ( +- 3.38% ) [47.53%]
117,481 iTLB-load-misses # 0.069 M/sec ( +- 14.99% ) [47.01%]
4,590,653 L1-dcache-prefetches # 2.695 M/sec ( +- 4.49% ) [46.19%]
1,712,660 L1-dcache-prefetch-misses # 1.005 M/sec ( +- 3.75% ) [44.82%]
0.195622057 seconds time elapsed ( +- 6.84% )
Also clean up the attribute construction code to be appending, and factor
it out into add_default_attributes().
Tweak the coverage percentage printout a bit, so that it's easier to view it
alongside the +- stddev column.
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/n/tip-to3kgu04449s64062val8b62@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | tools/perf/builtin-stat.c | 209 |
1 files changed, 154 insertions, 55 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 602c3c96fa1e..a89fc0835367 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c | |||
@@ -6,24 +6,28 @@ | |||
6 | * | 6 | * |
7 | * Sample output: | 7 | * Sample output: |
8 | 8 | ||
9 | $ perf stat ~/hackbench 10 | 9 | $ perf stat ./hackbench 10 |
10 | Time: 0.104 | ||
11 | 10 | ||
12 | Performance counter stats for '/home/mingo/hackbench': | 11 | Time: 0.118 |
13 | 12 | ||
14 | 1255.538611 task clock ticks # 10.143 CPU utilization factor | 13 | Performance counter stats for './hackbench 10': |
15 | 54011 context switches # 0.043 M/sec | ||
16 | 385 CPU migrations # 0.000 M/sec | ||
17 | 17755 pagefaults # 0.014 M/sec | ||
18 | 3808323185 CPU cycles # 3033.219 M/sec | ||
19 | 1575111190 instructions # 1254.530 M/sec | ||
20 | 17367895 cache references # 13.833 M/sec | ||
21 | 7674421 cache misses # 6.112 M/sec | ||
22 | 14 | ||
23 | Wall-clock time elapsed: 123.786620 msecs | 15 | 1708.761321 task-clock # 11.037 CPUs utilized |
16 | 41,190 context-switches # 0.024 M/sec | ||
17 | 6,735 CPU-migrations # 0.004 M/sec | ||
18 | 17,318 page-faults # 0.010 M/sec | ||
19 | 5,205,202,243 cycles # 3.046 GHz | ||
20 | 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle | ||
21 | 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle | ||
22 | 2,603,501,247 instructions # 0.50 insns per cycle | ||
23 | # 1.48 stalled cycles per insn | ||
24 | 484,357,498 branches # 283.455 M/sec | ||
25 | 6,388,934 branch-misses # 1.32% of all branches | ||
26 | |||
27 | 0.154822978 seconds time elapsed | ||
24 | 28 | ||
25 | * | 29 | * |
26 | * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> | 30 | * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com> |
27 | * | 31 | * |
28 | * Improvements and fixes by: | 32 | * Improvements and fixes by: |
29 | * | 33 | * |
@@ -75,22 +79,10 @@ static struct perf_event_attr default_attrs[] = { | |||
75 | }; | 79 | }; |
76 | 80 | ||
77 | /* | 81 | /* |
78 | * Detailed stats: | 82 | * Detailed stats (-d), covering the L1 and last level data caches: |
79 | */ | 83 | */ |
80 | static struct perf_event_attr detailed_attrs[] = { | 84 | static struct perf_event_attr detailed_attrs[] = { |
81 | 85 | ||
82 | { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, | ||
83 | { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, | ||
84 | { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, | ||
85 | { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, | ||
86 | |||
87 | { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, | ||
88 | { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, | ||
89 | { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, | ||
90 | { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, | ||
91 | { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, | ||
92 | { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, | ||
93 | |||
94 | { .type = PERF_TYPE_HW_CACHE, | 86 | { .type = PERF_TYPE_HW_CACHE, |
95 | .config = | 87 | .config = |
96 | PERF_COUNT_HW_CACHE_L1D << 0 | | 88 | PERF_COUNT_HW_CACHE_L1D << 0 | |
@@ -116,6 +108,69 @@ static struct perf_event_attr detailed_attrs[] = { | |||
116 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, | 108 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, |
117 | }; | 109 | }; |
118 | 110 | ||
111 | /* | ||
112 | * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: | ||
113 | */ | ||
114 | static struct perf_event_attr very_detailed_attrs[] = { | ||
115 | |||
116 | { .type = PERF_TYPE_HW_CACHE, | ||
117 | .config = | ||
118 | PERF_COUNT_HW_CACHE_L1I << 0 | | ||
119 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
120 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, | ||
121 | |||
122 | { .type = PERF_TYPE_HW_CACHE, | ||
123 | .config = | ||
124 | PERF_COUNT_HW_CACHE_L1I << 0 | | ||
125 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
126 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, | ||
127 | |||
128 | { .type = PERF_TYPE_HW_CACHE, | ||
129 | .config = | ||
130 | PERF_COUNT_HW_CACHE_DTLB << 0 | | ||
131 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
132 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, | ||
133 | |||
134 | { .type = PERF_TYPE_HW_CACHE, | ||
135 | .config = | ||
136 | PERF_COUNT_HW_CACHE_DTLB << 0 | | ||
137 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
138 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, | ||
139 | |||
140 | { .type = PERF_TYPE_HW_CACHE, | ||
141 | .config = | ||
142 | PERF_COUNT_HW_CACHE_ITLB << 0 | | ||
143 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
144 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, | ||
145 | |||
146 | { .type = PERF_TYPE_HW_CACHE, | ||
147 | .config = | ||
148 | PERF_COUNT_HW_CACHE_ITLB << 0 | | ||
149 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
150 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, | ||
151 | |||
152 | }; | ||
153 | |||
154 | /* | ||
155 | * Very, very detailed stats (-d -d -d), adding prefetch events: | ||
156 | */ | ||
157 | static struct perf_event_attr very_very_detailed_attrs[] = { | ||
158 | |||
159 | { .type = PERF_TYPE_HW_CACHE, | ||
160 | .config = | ||
161 | PERF_COUNT_HW_CACHE_L1D << 0 | | ||
162 | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | | ||
163 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, | ||
164 | |||
165 | { .type = PERF_TYPE_HW_CACHE, | ||
166 | .config = | ||
167 | PERF_COUNT_HW_CACHE_L1D << 0 | | ||
168 | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | | ||
169 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, | ||
170 | }; | ||
171 | |||
172 | |||
173 | |||
119 | struct perf_evlist *evsel_list; | 174 | struct perf_evlist *evsel_list; |
120 | 175 | ||
121 | static bool system_wide = false; | 176 | static bool system_wide = false; |
@@ -129,7 +184,7 @@ static pid_t target_pid = -1; | |||
129 | static pid_t target_tid = -1; | 184 | static pid_t target_tid = -1; |
130 | static pid_t child_pid = -1; | 185 | static pid_t child_pid = -1; |
131 | static bool null_run = false; | 186 | static bool null_run = false; |
132 | static bool detailed_run = false; | 187 | static int detailed_run = 0; |
133 | static bool sync_run = false; | 188 | static bool sync_run = false; |
134 | static bool big_num = true; | 189 | static bool big_num = true; |
135 | static int big_num_opt = -1; | 190 | static int big_num_opt = -1; |
@@ -464,7 +519,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) | |||
464 | { | 519 | { |
465 | double msecs = avg / 1e6; | 520 | double msecs = avg / 1e6; |
466 | char cpustr[16] = { '\0', }; | 521 | char cpustr[16] = { '\0', }; |
467 | const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s"; | 522 | const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s"; |
468 | 523 | ||
469 | if (no_aggr) | 524 | if (no_aggr) |
470 | sprintf(cpustr, "CPU%*d%s", | 525 | sprintf(cpustr, "CPU%*d%s", |
@@ -584,9 +639,9 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) | |||
584 | if (csv_output) | 639 | if (csv_output) |
585 | fmt = "%s%.0f%s%s"; | 640 | fmt = "%s%.0f%s%s"; |
586 | else if (big_num) | 641 | else if (big_num) |
587 | fmt = "%s%'18.0f%s%-24s"; | 642 | fmt = "%s%'18.0f%s%-25s"; |
588 | else | 643 | else |
589 | fmt = "%s%18.0f%s%-24s"; | 644 | fmt = "%s%18.0f%s%-25s"; |
590 | 645 | ||
591 | if (no_aggr) | 646 | if (no_aggr) |
592 | sprintf(cpustr, "CPU%*d%s", | 647 | sprintf(cpustr, "CPU%*d%s", |
@@ -616,7 +671,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) | |||
616 | 671 | ||
617 | if (total && avg) { | 672 | if (total && avg) { |
618 | ratio = total / avg; | 673 | ratio = total / avg; |
619 | fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); | 674 | fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); |
620 | } | 675 | } |
621 | 676 | ||
622 | } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && | 677 | } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && |
@@ -704,7 +759,7 @@ static void print_counter_aggr(struct perf_evsel *counter) | |||
704 | avg_enabled = avg_stats(&ps->res_stats[1]); | 759 | avg_enabled = avg_stats(&ps->res_stats[1]); |
705 | avg_running = avg_stats(&ps->res_stats[2]); | 760 | avg_running = avg_stats(&ps->res_stats[2]); |
706 | 761 | ||
707 | fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); | 762 | fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled); |
708 | } | 763 | } |
709 | fprintf(stderr, "\n"); | 764 | fprintf(stderr, "\n"); |
710 | } | 765 | } |
@@ -854,7 +909,7 @@ static const struct option options[] = { | |||
854 | "repeat command and print average + stddev (max: 100)"), | 909 | "repeat command and print average + stddev (max: 100)"), |
855 | OPT_BOOLEAN('n', "null", &null_run, | 910 | OPT_BOOLEAN('n', "null", &null_run, |
856 | "null run - dont start any counters"), | 911 | "null run - dont start any counters"), |
857 | OPT_BOOLEAN('d', "detailed", &detailed_run, | 912 | OPT_INCR('d', "detailed", &detailed_run, |
858 | "detailed run - start a lot of events"), | 913 | "detailed run - start a lot of events"), |
859 | OPT_BOOLEAN('S', "sync", &sync_run, | 914 | OPT_BOOLEAN('S', "sync", &sync_run, |
860 | "call sync() before starting a run"), | 915 | "call sync() before starting a run"), |
@@ -873,6 +928,70 @@ static const struct option options[] = { | |||
873 | OPT_END() | 928 | OPT_END() |
874 | }; | 929 | }; |
875 | 930 | ||
931 | /* | ||
932 | * Add default attributes, if there were no attributes specified or | ||
933 | * if -d/--detailed, -d -d or -d -d -d is used: | ||
934 | */ | ||
935 | static int add_default_attributes(void) | ||
936 | { | ||
937 | struct perf_evsel *pos; | ||
938 | size_t attr_nr = 0; | ||
939 | size_t c; | ||
940 | |||
941 | /* Set attrs if no event is selected and !null_run: */ | ||
942 | if (null_run) | ||
943 | return 0; | ||
944 | |||
945 | if (!evsel_list->nr_entries) { | ||
946 | for (c = 0; c < ARRAY_SIZE(default_attrs); c++) { | ||
947 | pos = perf_evsel__new(default_attrs + c, c + attr_nr); | ||
948 | if (pos == NULL) | ||
949 | return -1; | ||
950 | perf_evlist__add(evsel_list, pos); | ||
951 | } | ||
952 | attr_nr += c; | ||
953 | } | ||
954 | |||
955 | /* Detailed events get appended to the event list: */ | ||
956 | |||
957 | if (detailed_run < 1) | ||
958 | return 0; | ||
959 | |||
960 | /* Append detailed run extra attributes: */ | ||
961 | for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) { | ||
962 | pos = perf_evsel__new(detailed_attrs + c, c + attr_nr); | ||
963 | if (pos == NULL) | ||
964 | return -1; | ||
965 | perf_evlist__add(evsel_list, pos); | ||
966 | } | ||
967 | attr_nr += c; | ||
968 | |||
969 | if (detailed_run < 2) | ||
970 | return 0; | ||
971 | |||
972 | /* Append very detailed run extra attributes: */ | ||
973 | for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) { | ||
974 | pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr); | ||
975 | if (pos == NULL) | ||
976 | return -1; | ||
977 | perf_evlist__add(evsel_list, pos); | ||
978 | } | ||
979 | |||
980 | if (detailed_run < 3) | ||
981 | return 0; | ||
982 | |||
983 | /* Append very, very detailed run extra attributes: */ | ||
984 | for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) { | ||
985 | pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr); | ||
986 | if (pos == NULL) | ||
987 | return -1; | ||
988 | perf_evlist__add(evsel_list, pos); | ||
989 | } | ||
990 | |||
991 | |||
992 | return 0; | ||
993 | } | ||
994 | |||
876 | int cmd_stat(int argc, const char **argv, const char *prefix __used) | 995 | int cmd_stat(int argc, const char **argv, const char *prefix __used) |
877 | { | 996 | { |
878 | struct perf_evsel *pos; | 997 | struct perf_evsel *pos; |
@@ -918,28 +1037,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) | |||
918 | usage_with_options(stat_usage, options); | 1037 | usage_with_options(stat_usage, options); |
919 | } | 1038 | } |
920 | 1039 | ||
921 | /* Set attrs and nr_counters if no event is selected and !null_run */ | 1040 | if (add_default_attributes()) |
922 | if (detailed_run) { | 1041 | goto out; |
923 | size_t c; | ||
924 | |||
925 | for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) { | ||
926 | pos = perf_evsel__new(&detailed_attrs[c], c); | ||
927 | if (pos == NULL) | ||
928 | goto out; | ||
929 | perf_evlist__add(evsel_list, pos); | ||
930 | } | ||
931 | } | ||
932 | /* Set attrs and nr_counters if no event is selected and !null_run */ | ||
933 | if (!detailed_run && !null_run && !evsel_list->nr_entries) { | ||
934 | size_t c; | ||
935 | |||
936 | for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { | ||
937 | pos = perf_evsel__new(&default_attrs[c], c); | ||
938 | if (pos == NULL) | ||
939 | goto out; | ||
940 | perf_evlist__add(evsel_list, pos); | ||
941 | } | ||
942 | } | ||
943 | 1042 | ||
944 | if (target_pid != -1) | 1043 | if (target_pid != -1) |
945 | target_tid = target_pid; | 1044 | target_tid = target_pid; |