diff options
author | Stephane Eranian <eranian@google.com> | 2010-11-16 04:05:01 -0500 |
---|---|---|
committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2010-11-19 13:16:53 -0500 |
commit | f5b4a9c3ab53d544a540a6f3a5d17184e374d91a (patch) | |
tree | 26050a7485c92938f3a366857d443cf4779a6a39 /tools/perf/builtin-stat.c | |
parent | ae51ce9061b1ddc0fde363913c932bee5b9bc5fd (diff) |
perf stat: Add no-aggregation mode to -a
This patch adds a new -A option to perf stat. If specified then perf stat does
not aggregate counts across all monitored CPUs in system-wide mode, i.e., when
using -a. This option is not supported in per-thread mode.
Being able to get a per-cpu breakdown is useful to detect imbalances between
CPUs when running a uniform workload that spans all monitored CPUs.
The second version corrects the missing cpumap[] support, so that it works when
the -C option is used.
The third version fixes a missing cpumap[] in print_counter() and removes a
stray patch in builtin-trace.c.
Examples on a 4-way system:
# perf stat -a -e cycles,instructions -- sleep 1
Performance counter stats for 'sleep 1':
9592808135 cycles
3490380006 instructions # 0.364 IPC
1.001584632 seconds time elapsed
# perf stat -a -A -e cycles,instructions -- sleep 1
Performance counter stats for 'sleep 1':
CPU0 2398163767 cycles
CPU1 2398180817 cycles
CPU2 2398217115 cycles
CPU3 2398247483 cycles
CPU0 872282046 instructions # 0.364 IPC
CPU1 873481776 instructions # 0.364 IPC
CPU2 872638127 instructions # 0.364 IPC
CPU3 872437789 instructions # 0.364 IPC
1.001556052 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <4ce257b5.1e07e30a.7b6b.3aa9@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/builtin-stat.c')
-rw-r--r-- | tools/perf/builtin-stat.c | 169 |
1 files changed, 144 insertions, 25 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a6b4d44f9502..b3e568ffad27 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c | |||
@@ -75,6 +75,7 @@ static int run_idx = 0; | |||
75 | static int run_count = 1; | 75 | static int run_count = 1; |
76 | static bool no_inherit = false; | 76 | static bool no_inherit = false; |
77 | static bool scale = true; | 77 | static bool scale = true; |
78 | static bool no_aggr = false; | ||
78 | static pid_t target_pid = -1; | 79 | static pid_t target_pid = -1; |
79 | static pid_t target_tid = -1; | 80 | static pid_t target_tid = -1; |
80 | static pid_t *all_tids = NULL; | 81 | static pid_t *all_tids = NULL; |
@@ -89,6 +90,12 @@ static int *fd[MAX_NR_CPUS][MAX_COUNTERS]; | |||
89 | 90 | ||
90 | static int event_scaled[MAX_COUNTERS]; | 91 | static int event_scaled[MAX_COUNTERS]; |
91 | 92 | ||
93 | static struct { | ||
94 | u64 val; | ||
95 | u64 ena; | ||
96 | u64 run; | ||
97 | } cpu_counts[MAX_NR_CPUS][MAX_COUNTERS]; | ||
98 | |||
92 | static volatile int done = 0; | 99 | static volatile int done = 0; |
93 | 100 | ||
94 | struct stats | 101 | struct stats |
@@ -136,10 +143,10 @@ static double stddev_stats(struct stats *stats) | |||
136 | } | 143 | } |
137 | 144 | ||
138 | struct stats event_res_stats[MAX_COUNTERS][3]; | 145 | struct stats event_res_stats[MAX_COUNTERS][3]; |
139 | struct stats runtime_nsecs_stats; | 146 | struct stats runtime_nsecs_stats[MAX_NR_CPUS]; |
147 | struct stats runtime_cycles_stats[MAX_NR_CPUS]; | ||
148 | struct stats runtime_branches_stats[MAX_NR_CPUS]; | ||
140 | struct stats walltime_nsecs_stats; | 149 | struct stats walltime_nsecs_stats; |
141 | struct stats runtime_cycles_stats; | ||
142 | struct stats runtime_branches_stats; | ||
143 | 150 | ||
144 | #define MATCH_EVENT(t, c, counter) \ | 151 | #define MATCH_EVENT(t, c, counter) \ |
145 | (attrs[counter].type == PERF_TYPE_##t && \ | 152 | (attrs[counter].type == PERF_TYPE_##t && \ |
@@ -205,8 +212,9 @@ static inline int nsec_counter(int counter) | |||
205 | 212 | ||
206 | /* | 213 | /* |
207 | * Read out the results of a single counter: | 214 | * Read out the results of a single counter: |
215 | * aggregate counts across CPUs in system-wide mode | ||
208 | */ | 216 | */ |
209 | static void read_counter(int counter) | 217 | static void read_counter_aggr(int counter) |
210 | { | 218 | { |
211 | u64 count[3], single_count[3]; | 219 | u64 count[3], single_count[3]; |
212 | int cpu; | 220 | int cpu; |
@@ -264,11 +272,58 @@ static void read_counter(int counter) | |||
264 | * Save the full runtime - to allow normalization during printout: | 272 | * Save the full runtime - to allow normalization during printout: |
265 | */ | 273 | */ |
266 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) | 274 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) |
267 | update_stats(&runtime_nsecs_stats, count[0]); | 275 | update_stats(&runtime_nsecs_stats[0], count[0]); |
268 | if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) | 276 | if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) |
269 | update_stats(&runtime_cycles_stats, count[0]); | 277 | update_stats(&runtime_cycles_stats[0], count[0]); |
270 | if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) | 278 | if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) |
271 | update_stats(&runtime_branches_stats, count[0]); | 279 | update_stats(&runtime_branches_stats[0], count[0]); |
280 | } | ||
281 | |||
282 | /* | ||
283 | * Read out the results of a single counter: | ||
284 | * do not aggregate counts across CPUs in system-wide mode | ||
285 | */ | ||
286 | static void read_counter(int counter) | ||
287 | { | ||
288 | u64 count[3]; | ||
289 | int cpu; | ||
290 | size_t res, nv; | ||
291 | |||
292 | count[0] = count[1] = count[2] = 0; | ||
293 | |||
294 | nv = scale ? 3 : 1; | ||
295 | |||
296 | for (cpu = 0; cpu < nr_cpus; cpu++) { | ||
297 | |||
298 | if (fd[cpu][counter][0] < 0) | ||
299 | continue; | ||
300 | |||
301 | res = read(fd[cpu][counter][0], count, nv * sizeof(u64)); | ||
302 | |||
303 | assert(res == nv * sizeof(u64)); | ||
304 | |||
305 | close(fd[cpu][counter][0]); | ||
306 | fd[cpu][counter][0] = -1; | ||
307 | |||
308 | if (scale) { | ||
309 | if (count[2] == 0) { | ||
310 | count[0] = 0; | ||
311 | } else if (count[2] < count[1]) { | ||
312 | count[0] = (unsigned long long) | ||
313 | ((double)count[0] * count[1] / count[2] + 0.5); | ||
314 | } | ||
315 | } | ||
316 | cpu_counts[cpu][counter].val = count[0]; /* scaled count */ | ||
317 | cpu_counts[cpu][counter].ena = count[1]; | ||
318 | cpu_counts[cpu][counter].run = count[2]; | ||
319 | |||
320 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) | ||
321 | update_stats(&runtime_nsecs_stats[cpu], count[0]); | ||
322 | if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) | ||
323 | update_stats(&runtime_cycles_stats[cpu], count[0]); | ||
324 | if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) | ||
325 | update_stats(&runtime_branches_stats[cpu], count[0]); | ||
326 | } | ||
272 | } | 327 | } |
273 | 328 | ||
274 | static int run_perf_stat(int argc __used, const char **argv) | 329 | static int run_perf_stat(int argc __used, const char **argv) |
@@ -362,9 +417,13 @@ static int run_perf_stat(int argc __used, const char **argv) | |||
362 | 417 | ||
363 | update_stats(&walltime_nsecs_stats, t1 - t0); | 418 | update_stats(&walltime_nsecs_stats, t1 - t0); |
364 | 419 | ||
365 | for (counter = 0; counter < nr_counters; counter++) | 420 | if (no_aggr) { |
366 | read_counter(counter); | 421 | for (counter = 0; counter < nr_counters; counter++) |
367 | 422 | read_counter(counter); | |
423 | } else { | ||
424 | for (counter = 0; counter < nr_counters; counter++) | ||
425 | read_counter_aggr(counter); | ||
426 | } | ||
368 | return WEXITSTATUS(status); | 427 | return WEXITSTATUS(status); |
369 | } | 428 | } |
370 | 429 | ||
@@ -377,11 +436,15 @@ static void print_noise(int counter, double avg) | |||
377 | 100 * stddev_stats(&event_res_stats[counter][0]) / avg); | 436 | 100 * stddev_stats(&event_res_stats[counter][0]) / avg); |
378 | } | 437 | } |
379 | 438 | ||
380 | static void nsec_printout(int counter, double avg) | 439 | static void nsec_printout(int cpu, int counter, double avg) |
381 | { | 440 | { |
382 | double msecs = avg / 1e6; | 441 | double msecs = avg / 1e6; |
383 | 442 | ||
384 | fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); | 443 | if (no_aggr) |
444 | fprintf(stderr, "CPU%-4d %18.6f %-24s", | ||
445 | cpumap[cpu], msecs, event_name(counter)); | ||
446 | else | ||
447 | fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); | ||
385 | 448 | ||
386 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { | 449 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { |
387 | fprintf(stderr, " # %10.3f CPUs ", | 450 | fprintf(stderr, " # %10.3f CPUs ", |
@@ -389,33 +452,41 @@ static void nsec_printout(int counter, double avg) | |||
389 | } | 452 | } |
390 | } | 453 | } |
391 | 454 | ||
392 | static void abs_printout(int counter, double avg) | 455 | static void abs_printout(int cpu, int counter, double avg) |
393 | { | 456 | { |
394 | double total, ratio = 0.0; | 457 | double total, ratio = 0.0; |
458 | char cpustr[16] = { '\0', }; | ||
459 | |||
460 | if (no_aggr) | ||
461 | sprintf(cpustr, "CPU%-4d", cpumap[cpu]); | ||
462 | else | ||
463 | cpu = 0; | ||
395 | 464 | ||
396 | if (big_num) | 465 | if (big_num) |
397 | fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter)); | 466 | fprintf(stderr, "%s %'18.0f %-24s", |
467 | cpustr, avg, event_name(counter)); | ||
398 | else | 468 | else |
399 | fprintf(stderr, " %18.0f %-24s", avg, event_name(counter)); | 469 | fprintf(stderr, "%s %18.0f %-24s", |
470 | cpustr, avg, event_name(counter)); | ||
400 | 471 | ||
401 | if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { | 472 | if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { |
402 | total = avg_stats(&runtime_cycles_stats); | 473 | total = avg_stats(&runtime_cycles_stats[cpu]); |
403 | 474 | ||
404 | if (total) | 475 | if (total) |
405 | ratio = avg / total; | 476 | ratio = avg / total; |
406 | 477 | ||
407 | fprintf(stderr, " # %10.3f IPC ", ratio); | 478 | fprintf(stderr, " # %10.3f IPC ", ratio); |
408 | } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && | 479 | } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && |
409 | runtime_branches_stats.n != 0) { | 480 | runtime_branches_stats[cpu].n != 0) { |
410 | total = avg_stats(&runtime_branches_stats); | 481 | total = avg_stats(&runtime_branches_stats[cpu]); |
411 | 482 | ||
412 | if (total) | 483 | if (total) |
413 | ratio = avg * 100 / total; | 484 | ratio = avg * 100 / total; |
414 | 485 | ||
415 | fprintf(stderr, " # %10.3f %% ", ratio); | 486 | fprintf(stderr, " # %10.3f %% ", ratio); |
416 | 487 | ||
417 | } else if (runtime_nsecs_stats.n != 0) { | 488 | } else if (runtime_nsecs_stats[cpu].n != 0) { |
418 | total = avg_stats(&runtime_nsecs_stats); | 489 | total = avg_stats(&runtime_nsecs_stats[cpu]); |
419 | 490 | ||
420 | if (total) | 491 | if (total) |
421 | ratio = 1000.0 * avg / total; | 492 | ratio = 1000.0 * avg / total; |
@@ -426,8 +497,9 @@ static void abs_printout(int counter, double avg) | |||
426 | 497 | ||
427 | /* | 498 | /* |
428 | * Print out the results of a single counter: | 499 | * Print out the results of a single counter: |
500 | * aggregated counts in system-wide mode | ||
429 | */ | 501 | */ |
430 | static void print_counter(int counter) | 502 | static void print_counter_aggr(int counter) |
431 | { | 503 | { |
432 | double avg = avg_stats(&event_res_stats[counter][0]); | 504 | double avg = avg_stats(&event_res_stats[counter][0]); |
433 | int scaled = event_scaled[counter]; | 505 | int scaled = event_scaled[counter]; |
@@ -439,9 +511,9 @@ static void print_counter(int counter) | |||
439 | } | 511 | } |
440 | 512 | ||
441 | if (nsec_counter(counter)) | 513 | if (nsec_counter(counter)) |
442 | nsec_printout(counter, avg); | 514 | nsec_printout(-1, counter, avg); |
443 | else | 515 | else |
444 | abs_printout(counter, avg); | 516 | abs_printout(-1, counter, avg); |
445 | 517 | ||
446 | print_noise(counter, avg); | 518 | print_noise(counter, avg); |
447 | 519 | ||
@@ -458,6 +530,42 @@ static void print_counter(int counter) | |||
458 | fprintf(stderr, "\n"); | 530 | fprintf(stderr, "\n"); |
459 | } | 531 | } |
460 | 532 | ||
533 | /* | ||
534 | * Print out the results of a single counter: | ||
535 | * does not use aggregated count in system-wide | ||
536 | */ | ||
537 | static void print_counter(int counter) | ||
538 | { | ||
539 | u64 ena, run, val; | ||
540 | int cpu; | ||
541 | |||
542 | for (cpu = 0; cpu < nr_cpus; cpu++) { | ||
543 | val = cpu_counts[cpu][counter].val; | ||
544 | ena = cpu_counts[cpu][counter].ena; | ||
545 | run = cpu_counts[cpu][counter].run; | ||
546 | if (run == 0 || ena == 0) { | ||
547 | fprintf(stderr, "CPU%-4d %18s %-24s", cpumap[cpu], | ||
548 | "<not counted>", event_name(counter)); | ||
549 | |||
550 | fprintf(stderr, "\n"); | ||
551 | continue; | ||
552 | } | ||
553 | |||
554 | if (nsec_counter(counter)) | ||
555 | nsec_printout(cpu, counter, val); | ||
556 | else | ||
557 | abs_printout(cpu, counter, val); | ||
558 | |||
559 | print_noise(counter, 1.0); | ||
560 | |||
561 | if (run != ena) { | ||
562 | fprintf(stderr, " (scaled from %.2f%%)", | ||
563 | 100.0 * run / ena); | ||
564 | } | ||
565 | fprintf(stderr, "\n"); | ||
566 | } | ||
567 | } | ||
568 | |||
461 | static void print_stat(int argc, const char **argv) | 569 | static void print_stat(int argc, const char **argv) |
462 | { | 570 | { |
463 | int i, counter; | 571 | int i, counter; |
@@ -480,8 +588,13 @@ static void print_stat(int argc, const char **argv) | |||
480 | fprintf(stderr, " (%d runs)", run_count); | 588 | fprintf(stderr, " (%d runs)", run_count); |
481 | fprintf(stderr, ":\n\n"); | 589 | fprintf(stderr, ":\n\n"); |
482 | 590 | ||
483 | for (counter = 0; counter < nr_counters; counter++) | 591 | if (no_aggr) { |
484 | print_counter(counter); | 592 | for (counter = 0; counter < nr_counters; counter++) |
593 | print_counter(counter); | ||
594 | } else { | ||
595 | for (counter = 0; counter < nr_counters; counter++) | ||
596 | print_counter_aggr(counter); | ||
597 | } | ||
485 | 598 | ||
486 | fprintf(stderr, "\n"); | 599 | fprintf(stderr, "\n"); |
487 | fprintf(stderr, " %18.9f seconds time elapsed", | 600 | fprintf(stderr, " %18.9f seconds time elapsed", |
@@ -545,6 +658,8 @@ static const struct option options[] = { | |||
545 | "print large numbers with thousands\' separators"), | 658 | "print large numbers with thousands\' separators"), |
546 | OPT_STRING('C', "cpu", &cpu_list, "cpu", | 659 | OPT_STRING('C', "cpu", &cpu_list, "cpu", |
547 | "list of cpus to monitor in system-wide"), | 660 | "list of cpus to monitor in system-wide"), |
661 | OPT_BOOLEAN('A', "no-aggr", &no_aggr, | ||
662 | "disable CPU count aggregation"), | ||
548 | OPT_END() | 663 | OPT_END() |
549 | }; | 664 | }; |
550 | 665 | ||
@@ -562,6 +677,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) | |||
562 | if (run_count <= 0) | 677 | if (run_count <= 0) |
563 | usage_with_options(stat_usage, options); | 678 | usage_with_options(stat_usage, options); |
564 | 679 | ||
680 | /* no_aggr is for system-wide only */ | ||
681 | if (no_aggr && !system_wide) | ||
682 | usage_with_options(stat_usage, options); | ||
683 | |||
565 | /* Set attrs and nr_counters if no event is selected and !null_run */ | 684 | /* Set attrs and nr_counters if no event is selected and !null_run */ |
566 | if (!null_run && !nr_counters) { | 685 | if (!null_run && !nr_counters) { |
567 | memcpy(attrs, default_attrs, sizeof(default_attrs)); | 686 | memcpy(attrs, default_attrs, sizeof(default_attrs)); |