diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-06-13 08:57:28 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-06-13 09:18:57 -0400 |
commit | 42202dd56c717f173cd0bf2390249e1bf5cf210b (patch) | |
tree | 3c0e09eb6a9ea978a476801f40d7070fedcf8359 | |
parent | 44175b6f397a6724121eeaf0f072e2c912a46614 (diff) |
perf stat: Add feature to run and measure a command multiple times
Add the --repeat <n> feature to perf stat, which repeats a given
command up to a 100 times, collects the stats and calculates an
average and a stddev.
For example, the following oneliner 'perf stat' command runs hackbench
5 times and prints a tabulated result of all metrics, with averages
and noise levels (in percentage) printed:
aldebaran:~/linux/linux/tools/perf> ./perf stat --repeat 5 ~/hackbench 10
Time: 0.117
Time: 0.108
Time: 0.089
Time: 0.088
Time: 0.100
Performance counter stats for '/home/mingo/hackbench 10' (5 runs):
1243.989586 task-clock-msecs # 10.460 CPUs ( +- 4.720% )
47706 context-switches # 0.038 M/sec ( +- 19.706% )
387 CPU-migrations # 0.000 M/sec ( +- 3.608% )
17793 page-faults # 0.014 M/sec ( +- 0.354% )
3770941606 cycles # 3031.329 M/sec ( +- 4.621% )
1566372416 instructions # 0.415 IPC ( +- 2.703% )
16783421 cache-references # 13.492 M/sec ( +- 5.202% )
7128590 cache-misses # 5.730 M/sec ( +- 7.420% )
0.118924455 seconds time elapsed.
The goal of this feature is to allow the reliance on these accurate
statistics and to know how many times a command has to be repeated
for the noise to go down to an acceptable level.
(The -v option can be used to see a line printed out as each run progresses.)
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | tools/perf/builtin-stat.c | 259 |
1 files changed, 194 insertions, 65 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c12804853eab..9eb42b1ae784 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include "util/parse-events.h" | 43 | #include "util/parse-events.h" |
44 | 44 | ||
45 | #include <sys/prctl.h> | 45 | #include <sys/prctl.h> |
46 | #include <math.h> | ||
46 | 47 | ||
47 | static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { | 48 | static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { |
48 | 49 | ||
@@ -79,12 +80,34 @@ static const unsigned int default_count[] = { | |||
79 | 10000, | 80 | 10000, |
80 | }; | 81 | }; |
81 | 82 | ||
82 | static __u64 event_res[MAX_COUNTERS][3]; | 83 | #define MAX_RUN 100 |
83 | static __u64 event_scaled[MAX_COUNTERS]; | ||
84 | 84 | ||
85 | static __u64 runtime_nsecs; | 85 | static int run_count = 1; |
86 | static __u64 walltime_nsecs; | 86 | static int run_idx = 0; |
87 | static __u64 runtime_cycles; | 87 | |
88 | static __u64 event_res[MAX_RUN][MAX_COUNTERS][3]; | ||
89 | static __u64 event_scaled[MAX_RUN][MAX_COUNTERS]; | ||
90 | |||
91 | //static __u64 event_hist[MAX_RUN][MAX_COUNTERS][3]; | ||
92 | |||
93 | |||
94 | static __u64 runtime_nsecs[MAX_RUN]; | ||
95 | static __u64 walltime_nsecs[MAX_RUN]; | ||
96 | static __u64 runtime_cycles[MAX_RUN]; | ||
97 | |||
98 | static __u64 event_res_avg[MAX_COUNTERS][3]; | ||
99 | static __u64 event_res_noise[MAX_COUNTERS][3]; | ||
100 | |||
101 | static __u64 event_scaled_avg[MAX_COUNTERS]; | ||
102 | |||
103 | static __u64 runtime_nsecs_avg; | ||
104 | static __u64 runtime_nsecs_noise; | ||
105 | |||
106 | static __u64 walltime_nsecs_avg; | ||
107 | static __u64 walltime_nsecs_noise; | ||
108 | |||
109 | static __u64 runtime_cycles_avg; | ||
110 | static __u64 runtime_cycles_noise; | ||
88 | 111 | ||
89 | static void create_perf_stat_counter(int counter) | 112 | static void create_perf_stat_counter(int counter) |
90 | { | 113 | { |
@@ -140,7 +163,7 @@ static void read_counter(int counter) | |||
140 | int cpu, nv; | 163 | int cpu, nv; |
141 | int scaled; | 164 | int scaled; |
142 | 165 | ||
143 | count = event_res[counter]; | 166 | count = event_res[run_idx][counter]; |
144 | 167 | ||
145 | count[0] = count[1] = count[2] = 0; | 168 | count[0] = count[1] = count[2] = 0; |
146 | 169 | ||
@@ -151,6 +174,8 @@ static void read_counter(int counter) | |||
151 | 174 | ||
152 | res = read(fd[cpu][counter], single_count, nv * sizeof(__u64)); | 175 | res = read(fd[cpu][counter], single_count, nv * sizeof(__u64)); |
153 | assert(res == nv * sizeof(__u64)); | 176 | assert(res == nv * sizeof(__u64)); |
177 | close(fd[cpu][counter]); | ||
178 | fd[cpu][counter] = -1; | ||
154 | 179 | ||
155 | count[0] += single_count[0]; | 180 | count[0] += single_count[0]; |
156 | if (scale) { | 181 | if (scale) { |
@@ -162,13 +187,13 @@ static void read_counter(int counter) | |||
162 | scaled = 0; | 187 | scaled = 0; |
163 | if (scale) { | 188 | if (scale) { |
164 | if (count[2] == 0) { | 189 | if (count[2] == 0) { |
165 | event_scaled[counter] = -1; | 190 | event_scaled[run_idx][counter] = -1; |
166 | count[0] = 0; | 191 | count[0] = 0; |
167 | return; | 192 | return; |
168 | } | 193 | } |
169 | 194 | ||
170 | if (count[2] < count[1]) { | 195 | if (count[2] < count[1]) { |
171 | event_scaled[counter] = 1; | 196 | event_scaled[run_idx][counter] = 1; |
172 | count[0] = (unsigned long long) | 197 | count[0] = (unsigned long long) |
173 | ((double)count[0] * count[1] / count[2] + 0.5); | 198 | ((double)count[0] * count[1] / count[2] + 0.5); |
174 | } | 199 | } |
@@ -178,13 +203,62 @@ static void read_counter(int counter) | |||
178 | */ | 203 | */ |
179 | if (attrs[counter].type == PERF_TYPE_SOFTWARE && | 204 | if (attrs[counter].type == PERF_TYPE_SOFTWARE && |
180 | attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) | 205 | attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) |
181 | runtime_nsecs = count[0]; | 206 | runtime_nsecs[run_idx] = count[0]; |
182 | if (attrs[counter].type == PERF_TYPE_HARDWARE && | 207 | if (attrs[counter].type == PERF_TYPE_HARDWARE && |
183 | attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) | 208 | attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) |
184 | runtime_cycles = count[0]; | 209 | runtime_cycles[run_idx] = count[0]; |
185 | } | 210 | } |
186 | 211 | ||
187 | static void nsec_printout(int counter, __u64 *count) | 212 | static int run_perf_stat(int argc, const char **argv) |
213 | { | ||
214 | unsigned long long t0, t1; | ||
215 | int status = 0; | ||
216 | int counter; | ||
217 | int pid; | ||
218 | |||
219 | if (!system_wide) | ||
220 | nr_cpus = 1; | ||
221 | |||
222 | for (counter = 0; counter < nr_counters; counter++) | ||
223 | create_perf_stat_counter(counter); | ||
224 | |||
225 | /* | ||
226 | * Enable counters and exec the command: | ||
227 | */ | ||
228 | t0 = rdclock(); | ||
229 | prctl(PR_TASK_PERF_COUNTERS_ENABLE); | ||
230 | |||
231 | if ((pid = fork()) < 0) | ||
232 | perror("failed to fork"); | ||
233 | |||
234 | if (!pid) { | ||
235 | if (execvp(argv[0], (char **)argv)) { | ||
236 | perror(argv[0]); | ||
237 | exit(-1); | ||
238 | } | ||
239 | } | ||
240 | |||
241 | wait(&status); | ||
242 | |||
243 | prctl(PR_TASK_PERF_COUNTERS_DISABLE); | ||
244 | t1 = rdclock(); | ||
245 | |||
246 | walltime_nsecs[run_idx] = t1 - t0; | ||
247 | |||
248 | for (counter = 0; counter < nr_counters; counter++) | ||
249 | read_counter(counter); | ||
250 | |||
251 | return WEXITSTATUS(status); | ||
252 | } | ||
253 | |||
254 | static void print_noise(__u64 *count, __u64 *noise) | ||
255 | { | ||
256 | if (run_count > 1) | ||
257 | fprintf(stderr, " ( +- %7.3f%% )", | ||
258 | (double)noise[0]/(count[0]+1)*100.0); | ||
259 | } | ||
260 | |||
261 | static void nsec_printout(int counter, __u64 *count, __u64 *noise) | ||
188 | { | 262 | { |
189 | double msecs = (double)count[0] / 1000000; | 263 | double msecs = (double)count[0] / 1000000; |
190 | 264 | ||
@@ -193,29 +267,30 @@ static void nsec_printout(int counter, __u64 *count) | |||
193 | if (attrs[counter].type == PERF_TYPE_SOFTWARE && | 267 | if (attrs[counter].type == PERF_TYPE_SOFTWARE && |
194 | attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { | 268 | attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { |
195 | 269 | ||
196 | if (walltime_nsecs) | 270 | if (walltime_nsecs_avg) |
197 | fprintf(stderr, " # %10.3f CPUs", | 271 | fprintf(stderr, " # %10.3f CPUs ", |
198 | (double)count[0] / (double)walltime_nsecs); | 272 | (double)count[0] / (double)walltime_nsecs_avg); |
199 | } | 273 | } |
274 | print_noise(count, noise); | ||
200 | } | 275 | } |
201 | 276 | ||
202 | static void abs_printout(int counter, __u64 *count) | 277 | static void abs_printout(int counter, __u64 *count, __u64 *noise) |
203 | { | 278 | { |
204 | fprintf(stderr, " %14Ld %-20s", count[0], event_name(counter)); | 279 | fprintf(stderr, " %14Ld %-20s", count[0], event_name(counter)); |
205 | 280 | ||
206 | if (runtime_cycles && | 281 | if (runtime_cycles_avg && |
207 | attrs[counter].type == PERF_TYPE_HARDWARE && | 282 | attrs[counter].type == PERF_TYPE_HARDWARE && |
208 | attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { | 283 | attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { |
209 | 284 | ||
210 | fprintf(stderr, " # %10.3f IPC", | 285 | fprintf(stderr, " # %10.3f IPC ", |
211 | (double)count[0] / (double)runtime_cycles); | 286 | (double)count[0] / (double)runtime_cycles_avg); |
212 | 287 | } else { | |
213 | return; | 288 | if (runtime_nsecs_avg) { |
289 | fprintf(stderr, " # %10.3f M/sec", | ||
290 | (double)count[0]/runtime_nsecs_avg*1000.0); | ||
291 | } | ||
214 | } | 292 | } |
215 | 293 | print_noise(count, noise); | |
216 | if (runtime_nsecs) | ||
217 | fprintf(stderr, " # %10.3f M/sec", | ||
218 | (double)count[0]/runtime_nsecs*1000.0); | ||
219 | } | 294 | } |
220 | 295 | ||
221 | /* | 296 | /* |
@@ -223,11 +298,12 @@ static void abs_printout(int counter, __u64 *count) | |||
223 | */ | 298 | */ |
224 | static void print_counter(int counter) | 299 | static void print_counter(int counter) |
225 | { | 300 | { |
226 | __u64 *count; | 301 | __u64 *count, *noise; |
227 | int scaled; | 302 | int scaled; |
228 | 303 | ||
229 | count = event_res[counter]; | 304 | count = event_res_avg[counter]; |
230 | scaled = event_scaled[counter]; | 305 | noise = event_res_noise[counter]; |
306 | scaled = event_scaled_avg[counter]; | ||
231 | 307 | ||
232 | if (scaled == -1) { | 308 | if (scaled == -1) { |
233 | fprintf(stderr, " %14s %-20s\n", | 309 | fprintf(stderr, " %14s %-20s\n", |
@@ -236,9 +312,9 @@ static void print_counter(int counter) | |||
236 | } | 312 | } |
237 | 313 | ||
238 | if (nsec_counter(counter)) | 314 | if (nsec_counter(counter)) |
239 | nsec_printout(counter, count); | 315 | nsec_printout(counter, count, noise); |
240 | else | 316 | else |
241 | abs_printout(counter, count); | 317 | abs_printout(counter, count, noise); |
242 | 318 | ||
243 | if (scaled) | 319 | if (scaled) |
244 | fprintf(stderr, " (scaled from %.2f%%)", | 320 | fprintf(stderr, " (scaled from %.2f%%)", |
@@ -247,43 +323,83 @@ static void print_counter(int counter) | |||
247 | fprintf(stderr, "\n"); | 323 | fprintf(stderr, "\n"); |
248 | } | 324 | } |
249 | 325 | ||
250 | static int do_perf_stat(int argc, const char **argv) | 326 | /* |
327 | * Normalize noise values down to stddev: | ||
328 | */ | ||
329 | static void normalize(__u64 *val) | ||
251 | { | 330 | { |
252 | unsigned long long t0, t1; | 331 | double res; |
253 | int counter; | ||
254 | int status; | ||
255 | int pid; | ||
256 | int i; | ||
257 | |||
258 | if (!system_wide) | ||
259 | nr_cpus = 1; | ||
260 | 332 | ||
261 | for (counter = 0; counter < nr_counters; counter++) | 333 | res = (double)*val / (run_count * sqrt((double)run_count)); |
262 | create_perf_stat_counter(counter); | ||
263 | 334 | ||
264 | /* | 335 | *val = (__u64)res; |
265 | * Enable counters and exec the command: | 336 | } |
266 | */ | ||
267 | t0 = rdclock(); | ||
268 | prctl(PR_TASK_PERF_COUNTERS_ENABLE); | ||
269 | 337 | ||
270 | if ((pid = fork()) < 0) | 338 | /* |
271 | perror("failed to fork"); | 339 | * Calculate the averages and noises: |
340 | */ | ||
341 | static void calc_avg(void) | ||
342 | { | ||
343 | int i, j; | ||
344 | |||
345 | for (i = 0; i < run_count; i++) { | ||
346 | runtime_nsecs_avg += runtime_nsecs[i]; | ||
347 | walltime_nsecs_avg += walltime_nsecs[i]; | ||
348 | runtime_cycles_avg += runtime_cycles[i]; | ||
349 | |||
350 | for (j = 0; j < nr_counters; j++) { | ||
351 | event_res_avg[j][0] += event_res[i][j][0]; | ||
352 | event_res_avg[j][1] += event_res[i][j][1]; | ||
353 | event_res_avg[j][2] += event_res[i][j][2]; | ||
354 | event_scaled_avg[j] += event_scaled[i][j]; | ||
355 | } | ||
356 | } | ||
357 | runtime_nsecs_avg /= run_count; | ||
358 | walltime_nsecs_avg /= run_count; | ||
359 | runtime_cycles_avg /= run_count; | ||
360 | |||
361 | for (j = 0; j < nr_counters; j++) { | ||
362 | event_res_avg[j][0] /= run_count; | ||
363 | event_res_avg[j][1] /= run_count; | ||
364 | event_res_avg[j][2] /= run_count; | ||
365 | } | ||
272 | 366 | ||
273 | if (!pid) { | 367 | for (i = 0; i < run_count; i++) { |
274 | if (execvp(argv[0], (char **)argv)) { | 368 | runtime_nsecs_noise += |
275 | perror(argv[0]); | 369 | abs((__s64)(runtime_nsecs[i] - runtime_nsecs_avg)); |
276 | exit(-1); | 370 | walltime_nsecs_noise += |
371 | abs((__s64)(walltime_nsecs[i] - walltime_nsecs_avg)); | ||
372 | runtime_cycles_noise += | ||
373 | abs((__s64)(runtime_cycles[i] - runtime_cycles_avg)); | ||
374 | |||
375 | for (j = 0; j < nr_counters; j++) { | ||
376 | event_res_noise[j][0] += | ||
377 | abs((__s64)(event_res[i][j][0] - event_res_avg[j][0])); | ||
378 | event_res_noise[j][1] += | ||
379 | abs((__s64)(event_res[i][j][1] - event_res_avg[j][1])); | ||
380 | event_res_noise[j][2] += | ||
381 | abs((__s64)(event_res[i][j][2] - event_res_avg[j][2])); | ||
277 | } | 382 | } |
278 | } | 383 | } |
279 | 384 | ||
280 | while (wait(&status) >= 0) | 385 | normalize(&runtime_nsecs_noise); |
281 | ; | 386 | normalize(&walltime_nsecs_noise); |
387 | normalize(&runtime_cycles_noise); | ||
282 | 388 | ||
283 | prctl(PR_TASK_PERF_COUNTERS_DISABLE); | 389 | for (j = 0; j < nr_counters; j++) { |
284 | t1 = rdclock(); | 390 | normalize(&event_res_noise[j][0]); |
391 | normalize(&event_res_noise[j][1]); | ||
392 | normalize(&event_res_noise[j][2]); | ||
393 | } | ||
394 | } | ||
395 | |||
396 | static void print_stat(int argc, const char **argv) | ||
397 | { | ||
398 | int i, counter; | ||
399 | |||
400 | calc_avg(); | ||
285 | 401 | ||
286 | walltime_nsecs = t1 - t0; | 402 | run_idx = 0; |
287 | 403 | ||
288 | fflush(stdout); | 404 | fflush(stdout); |
289 | 405 | ||
@@ -293,21 +409,19 @@ static int do_perf_stat(int argc, const char **argv) | |||
293 | for (i = 1; i < argc; i++) | 409 | for (i = 1; i < argc; i++) |
294 | fprintf(stderr, " %s", argv[i]); | 410 | fprintf(stderr, " %s", argv[i]); |
295 | 411 | ||
296 | fprintf(stderr, "\':\n"); | 412 | fprintf(stderr, "\'"); |
297 | fprintf(stderr, "\n"); | 413 | if (run_count > 1) |
298 | 414 | fprintf(stderr, " (%d runs)", run_count); | |
299 | for (counter = 0; counter < nr_counters; counter++) | 415 | fprintf(stderr, ":\n\n"); |
300 | read_counter(counter); | ||
301 | 416 | ||
302 | for (counter = 0; counter < nr_counters; counter++) | 417 | for (counter = 0; counter < nr_counters; counter++) |
303 | print_counter(counter); | 418 | print_counter(counter); |
304 | 419 | ||
305 | 420 | ||
306 | fprintf(stderr, "\n"); | 421 | fprintf(stderr, "\n"); |
307 | fprintf(stderr, " %14.9f seconds time elapsed.\n", (double)(t1-t0)/1e9); | 422 | fprintf(stderr, " %14.9f seconds time elapsed.\n", |
423 | (double)walltime_nsecs_avg/1e9); | ||
308 | fprintf(stderr, "\n"); | 424 | fprintf(stderr, "\n"); |
309 | |||
310 | return 0; | ||
311 | } | 425 | } |
312 | 426 | ||
313 | static volatile int signr = -1; | 427 | static volatile int signr = -1; |
@@ -345,11 +459,15 @@ static const struct option options[] = { | |||
345 | "scale/normalize counters"), | 459 | "scale/normalize counters"), |
346 | OPT_BOOLEAN('v', "verbose", &verbose, | 460 | OPT_BOOLEAN('v', "verbose", &verbose, |
347 | "be more verbose (show counter open errors, etc)"), | 461 | "be more verbose (show counter open errors, etc)"), |
462 | OPT_INTEGER('r', "repeat", &run_count, | ||
463 | "repeat command and print average + stddev (max: 100)"), | ||
348 | OPT_END() | 464 | OPT_END() |
349 | }; | 465 | }; |
350 | 466 | ||
351 | int cmd_stat(int argc, const char **argv, const char *prefix) | 467 | int cmd_stat(int argc, const char **argv, const char *prefix) |
352 | { | 468 | { |
469 | int status; | ||
470 | |||
353 | page_size = sysconf(_SC_PAGE_SIZE); | 471 | page_size = sysconf(_SC_PAGE_SIZE); |
354 | 472 | ||
355 | memcpy(attrs, default_attrs, sizeof(attrs)); | 473 | memcpy(attrs, default_attrs, sizeof(attrs)); |
@@ -357,6 +475,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix) | |||
357 | argc = parse_options(argc, argv, options, stat_usage, 0); | 475 | argc = parse_options(argc, argv, options, stat_usage, 0); |
358 | if (!argc) | 476 | if (!argc) |
359 | usage_with_options(stat_usage, options); | 477 | usage_with_options(stat_usage, options); |
478 | if (run_count <= 0 || run_count > MAX_RUN) | ||
479 | usage_with_options(stat_usage, options); | ||
360 | 480 | ||
361 | if (!nr_counters) | 481 | if (!nr_counters) |
362 | nr_counters = 8; | 482 | nr_counters = 8; |
@@ -376,5 +496,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix) | |||
376 | signal(SIGALRM, skip_signal); | 496 | signal(SIGALRM, skip_signal); |
377 | signal(SIGABRT, skip_signal); | 497 | signal(SIGABRT, skip_signal); |
378 | 498 | ||
379 | return do_perf_stat(argc, argv); | 499 | status = 0; |
500 | for (run_idx = 0; run_idx < run_count; run_idx++) { | ||
501 | if (run_count != 1 && verbose) | ||
502 | fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx+1); | ||
503 | status = run_perf_stat(argc, argv); | ||
504 | } | ||
505 | |||
506 | print_stat(argc, argv); | ||
507 | |||
508 | return status; | ||
380 | } | 509 | } |