aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-05-19 07:30:56 -0400
committerIngo Molnar <mingo@elte.hu>2011-05-19 08:29:51 -0400
commit2cba3ffb9a9db3874304a1739002d053d53c738b (patch)
treea81c614ea1be6db95c2d0d1a1e57287a631e9c6b
parentb313207286a78abac19f1dd2721292eae598b0f5 (diff)
perf stat: Add -d -d and -d -d -d options to show more CPU events
Print even more detailed statistics if requested via perf stat -d: -d: detailed events, L1 and LLC data cache -d -d: more detailed events, dTLB and iTLB events -d -d -d: very detailed events, adding prefetch events Full output looks like this now: Performance counter stats for '/home/mingo/hackbench 10' (5 runs): 1703.674707 task-clock # 8.709 CPUs utilized ( +- 4.19% ) 49,068 context-switches # 0.029 M/sec ( +- 16.66% ) 8,303 CPU-migrations # 0.005 M/sec ( +- 24.90% ) 17,397 page-faults # 0.010 M/sec ( +- 0.46% ) 2,345,389,239 cycles # 1.377 GHz ( +- 4.61% ) [55.90%] 1,884,503,527 stalled-cycles-frontend # 80.35% frontend cycles idle ( +- 5.67% ) [50.39%] 743,919,737 stalled-cycles-backend # 31.72% backend cycles idle ( +- 8.75% ) [49.91%] 1,314,416,379 instructions # 0.56 insns per cycle # 1.43 stalled cycles per insn ( +- 2.53% ) [60.87%] 272,592,567 branches # 160.003 M/sec ( +- 1.74% ) [56.56%] 3,794,846 branch-misses # 1.39% of all branches ( +- 6.59% ) [58.50%] 449,982,778 L1-dcache-loads # 264.125 M/sec ( +- 2.47% ) [49.88%] 22,404,961 L1-dcache-load-misses # 4.98% of all L1-dcache hits ( +- 6.08% ) [55.05%] 6,204,750 LLC-loads # 3.642 M/sec ( +- 8.91% ) [43.75%] 1,837,411 LLC-load-misses # 1.078 M/sec ( +- 7.27% ) [12.07%] 411,440,421 L1-icache-loads # 241.502 M/sec ( +- 5.60% ) [36.52%] 27,556,832 L1-icache-load-misses # 16.175 M/sec ( +- 7.46% ) [46.72%] 464,067,627 dTLB-loads # 272.392 M/sec ( +- 4.46% ) [54.17%] 10,765,648 dTLB-load-misses # 6.319 M/sec ( +- 3.18% ) [48.68%] 1,273,080,386 iTLB-loads # 747.256 M/sec ( +- 3.38% ) [47.53%] 117,481 iTLB-load-misses # 0.069 M/sec ( +- 14.99% ) [47.01%] 4,590,653 L1-dcache-prefetches # 2.695 M/sec ( +- 4.49% ) [46.19%] 1,712,660 L1-dcache-prefetch-misses # 1.005 M/sec ( +- 3.75% ) [44.82%] 0.195622057 seconds time elapsed ( +- 6.84% ) Also clean up the attribute construction code to be appending, and factor it out into add_default_attributes(). 
Tweak the coverage percentage printout a bit, so that it's easier to view it alongside the +- stddev column. Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/n/tip-to3kgu04449s64062val8b62@git.kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--tools/perf/builtin-stat.c209
1 files changed, 154 insertions, 55 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 602c3c96fa1e..a89fc0835367 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -6,24 +6,28 @@
6 * 6 *
7 * Sample output: 7 * Sample output:
8 8
9 $ perf stat ~/hackbench 10 9 $ perf stat ./hackbench 10
10 Time: 0.104
11 10
12 Performance counter stats for '/home/mingo/hackbench': 11 Time: 0.118
13 12
14 1255.538611 task clock ticks # 10.143 CPU utilization factor 13 Performance counter stats for './hackbench 10':
15 54011 context switches # 0.043 M/sec
16 385 CPU migrations # 0.000 M/sec
17 17755 pagefaults # 0.014 M/sec
18 3808323185 CPU cycles # 3033.219 M/sec
19 1575111190 instructions # 1254.530 M/sec
20 17367895 cache references # 13.833 M/sec
21 7674421 cache misses # 6.112 M/sec
22 14
23 Wall-clock time elapsed: 123.786620 msecs 15 1708.761321 task-clock # 11.037 CPUs utilized
16 41,190 context-switches # 0.024 M/sec
17 6,735 CPU-migrations # 0.004 M/sec
18 17,318 page-faults # 0.010 M/sec
19 5,205,202,243 cycles # 3.046 GHz
20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle
21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle
22 2,603,501,247 instructions # 0.50 insns per cycle
23 # 1.48 stalled cycles per insn
24 484,357,498 branches # 283.455 M/sec
25 6,388,934 branch-misses # 1.32% of all branches
26
27 0.154822978 seconds time elapsed
24 28
25 * 29 *
26 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
27 * 31 *
28 * Improvements and fixes by: 32 * Improvements and fixes by:
29 * 33 *
@@ -75,22 +79,10 @@ static struct perf_event_attr default_attrs[] = {
75}; 79};
76 80
77/* 81/*
78 * Detailed stats: 82 * Detailed stats (-d), covering the L1 and last level data caches:
79 */ 83 */
80static struct perf_event_attr detailed_attrs[] = { 84static struct perf_event_attr detailed_attrs[] = {
81 85
82 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
83 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
84 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
85 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
86
87 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
88 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
89 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
90 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
91 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
92 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
93
94 { .type = PERF_TYPE_HW_CACHE, 86 { .type = PERF_TYPE_HW_CACHE,
95 .config = 87 .config =
96 PERF_COUNT_HW_CACHE_L1D << 0 | 88 PERF_COUNT_HW_CACHE_L1D << 0 |
@@ -116,6 +108,69 @@ static struct perf_event_attr detailed_attrs[] = {
116 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 108 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
117}; 109};
118 110
111/*
112 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
113 */
114static struct perf_event_attr very_detailed_attrs[] = {
115
116 { .type = PERF_TYPE_HW_CACHE,
117 .config =
118 PERF_COUNT_HW_CACHE_L1I << 0 |
119 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
120 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
121
122 { .type = PERF_TYPE_HW_CACHE,
123 .config =
124 PERF_COUNT_HW_CACHE_L1I << 0 |
125 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
126 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
127
128 { .type = PERF_TYPE_HW_CACHE,
129 .config =
130 PERF_COUNT_HW_CACHE_DTLB << 0 |
131 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
132 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
133
134 { .type = PERF_TYPE_HW_CACHE,
135 .config =
136 PERF_COUNT_HW_CACHE_DTLB << 0 |
137 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
138 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
139
140 { .type = PERF_TYPE_HW_CACHE,
141 .config =
142 PERF_COUNT_HW_CACHE_ITLB << 0 |
143 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
144 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
145
146 { .type = PERF_TYPE_HW_CACHE,
147 .config =
148 PERF_COUNT_HW_CACHE_ITLB << 0 |
149 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
150 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
151
152};
153
154/*
155 * Very, very detailed stats (-d -d -d), adding prefetch events:
156 */
157static struct perf_event_attr very_very_detailed_attrs[] = {
158
159 { .type = PERF_TYPE_HW_CACHE,
160 .config =
161 PERF_COUNT_HW_CACHE_L1D << 0 |
162 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
163 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
164
165 { .type = PERF_TYPE_HW_CACHE,
166 .config =
167 PERF_COUNT_HW_CACHE_L1D << 0 |
168 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
169 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
170};
171
172
173
119struct perf_evlist *evsel_list; 174struct perf_evlist *evsel_list;
120 175
121static bool system_wide = false; 176static bool system_wide = false;
@@ -129,7 +184,7 @@ static pid_t target_pid = -1;
129static pid_t target_tid = -1; 184static pid_t target_tid = -1;
130static pid_t child_pid = -1; 185static pid_t child_pid = -1;
131static bool null_run = false; 186static bool null_run = false;
132static bool detailed_run = false; 187static int detailed_run = 0;
133static bool sync_run = false; 188static bool sync_run = false;
134static bool big_num = true; 189static bool big_num = true;
135static int big_num_opt = -1; 190static int big_num_opt = -1;
@@ -464,7 +519,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
464{ 519{
465 double msecs = avg / 1e6; 520 double msecs = avg / 1e6;
466 char cpustr[16] = { '\0', }; 521 char cpustr[16] = { '\0', };
467 const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s"; 522 const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
468 523
469 if (no_aggr) 524 if (no_aggr)
470 sprintf(cpustr, "CPU%*d%s", 525 sprintf(cpustr, "CPU%*d%s",
@@ -584,9 +639,9 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
584 if (csv_output) 639 if (csv_output)
585 fmt = "%s%.0f%s%s"; 640 fmt = "%s%.0f%s%s";
586 else if (big_num) 641 else if (big_num)
587 fmt = "%s%'18.0f%s%-24s"; 642 fmt = "%s%'18.0f%s%-25s";
588 else 643 else
589 fmt = "%s%18.0f%s%-24s"; 644 fmt = "%s%18.0f%s%-25s";
590 645
591 if (no_aggr) 646 if (no_aggr)
592 sprintf(cpustr, "CPU%*d%s", 647 sprintf(cpustr, "CPU%*d%s",
@@ -616,7 +671,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
616 671
617 if (total && avg) { 672 if (total && avg) {
618 ratio = total / avg; 673 ratio = total / avg;
619 fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); 674 fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio);
620 } 675 }
621 676
622 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && 677 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
@@ -704,7 +759,7 @@ static void print_counter_aggr(struct perf_evsel *counter)
704 avg_enabled = avg_stats(&ps->res_stats[1]); 759 avg_enabled = avg_stats(&ps->res_stats[1]);
705 avg_running = avg_stats(&ps->res_stats[2]); 760 avg_running = avg_stats(&ps->res_stats[2]);
706 761
707 fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); 762 fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
708 } 763 }
709 fprintf(stderr, "\n"); 764 fprintf(stderr, "\n");
710} 765}
@@ -854,7 +909,7 @@ static const struct option options[] = {
854 "repeat command and print average + stddev (max: 100)"), 909 "repeat command and print average + stddev (max: 100)"),
855 OPT_BOOLEAN('n', "null", &null_run, 910 OPT_BOOLEAN('n', "null", &null_run,
856 "null run - dont start any counters"), 911 "null run - dont start any counters"),
857 OPT_BOOLEAN('d', "detailed", &detailed_run, 912 OPT_INCR('d', "detailed", &detailed_run,
858 "detailed run - start a lot of events"), 913 "detailed run - start a lot of events"),
859 OPT_BOOLEAN('S', "sync", &sync_run, 914 OPT_BOOLEAN('S', "sync", &sync_run,
860 "call sync() before starting a run"), 915 "call sync() before starting a run"),
@@ -873,6 +928,70 @@ static const struct option options[] = {
873 OPT_END() 928 OPT_END()
874}; 929};
875 930
931/*
932 * Add default attributes, if there were no attributes specified or
933 * if -d/--detailed, -d -d or -d -d -d is used:
934 */
935static int add_default_attributes(void)
936{
937 struct perf_evsel *pos;
938 size_t attr_nr = 0;
939 size_t c;
940
941 /* Set attrs if no event is selected and !null_run: */
942 if (null_run)
943 return 0;
944
945 if (!evsel_list->nr_entries) {
946 for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
947 pos = perf_evsel__new(default_attrs + c, c + attr_nr);
948 if (pos == NULL)
949 return -1;
950 perf_evlist__add(evsel_list, pos);
951 }
952 attr_nr += c;
953 }
954
955 /* Detailed events get appended to the event list: */
956
957 if (detailed_run < 1)
958 return 0;
959
960 /* Append detailed run extra attributes: */
961 for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
962 pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
963 if (pos == NULL)
964 return -1;
965 perf_evlist__add(evsel_list, pos);
966 }
967 attr_nr += c;
968
969 if (detailed_run < 2)
970 return 0;
971
972 /* Append very detailed run extra attributes: */
973 for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
974 pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
975 if (pos == NULL)
976 return -1;
977 perf_evlist__add(evsel_list, pos);
978 }
979
980 if (detailed_run < 3)
981 return 0;
982
983 /* Append very, very detailed run extra attributes: */
984 for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
985 pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
986 if (pos == NULL)
987 return -1;
988 perf_evlist__add(evsel_list, pos);
989 }
990
991
992 return 0;
993}
994
876int cmd_stat(int argc, const char **argv, const char *prefix __used) 995int cmd_stat(int argc, const char **argv, const char *prefix __used)
877{ 996{
878 struct perf_evsel *pos; 997 struct perf_evsel *pos;
@@ -918,28 +1037,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
918 usage_with_options(stat_usage, options); 1037 usage_with_options(stat_usage, options);
919 } 1038 }
920 1039
921 /* Set attrs and nr_counters if no event is selected and !null_run */ 1040 if (add_default_attributes())
922 if (detailed_run) { 1041 goto out;
923 size_t c;
924
925 for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) {
926 pos = perf_evsel__new(&detailed_attrs[c], c);
927 if (pos == NULL)
928 goto out;
929 perf_evlist__add(evsel_list, pos);
930 }
931 }
932 /* Set attrs and nr_counters if no event is selected and !null_run */
933 if (!detailed_run && !null_run && !evsel_list->nr_entries) {
934 size_t c;
935
936 for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
937 pos = perf_evsel__new(&default_attrs[c], c);
938 if (pos == NULL)
939 goto out;
940 perf_evlist__add(evsel_list, pos);
941 }
942 }
943 1042
944 if (target_pid != -1) 1043 if (target_pid != -1)
945 target_tid = target_pid; 1044 target_tid = target_pid;