aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/builtin-stat.c
diff options
context:
space:
mode:
authorStephane Eranian <eranian@google.com>2010-11-16 04:05:01 -0500
committerArnaldo Carvalho de Melo <acme@redhat.com>2010-11-19 13:16:53 -0500
commitf5b4a9c3ab53d544a540a6f3a5d17184e374d91a (patch)
tree26050a7485c92938f3a366857d443cf4779a6a39 /tools/perf/builtin-stat.c
parentae51ce9061b1ddc0fde363913c932bee5b9bc5fd (diff)
perf stat: Add no-aggregation mode to -a
This patch adds a new -A option to perf stat. If specified then perf stat does not aggregate counts across all monitored CPUs in system-wide mode, i.e., when using -a. This option is not supported in per-thread mode. Being able to get a per-cpu breakdown is useful to detect imbalances between CPUs when running a uniform workload that spans all monitored CPUs. The second version corrects the missing cpumap[] support, so that it works when the -C option is used. The third version fixes a missing cpumap[] in print_counter() and removes a stray patch in builtin-trace.c. Examples on a 4-way system: # perf stat -a -e cycles,instructions -- sleep 1 Performance counter stats for 'sleep 1': 9592808135 cycles 3490380006 instructions # 0.364 IPC 1.001584632 seconds time elapsed # perf stat -a -A -e cycles,instructions -- sleep 1 Performance counter stats for 'sleep 1': CPU0 2398163767 cycles CPU1 2398180817 cycles CPU2 2398217115 cycles CPU3 2398247483 cycles CPU0 872282046 instructions # 0.364 IPC CPU1 873481776 instructions # 0.364 IPC CPU2 872638127 instructions # 0.364 IPC CPU3 872437789 instructions # 0.364 IPC 1.001556052 seconds time elapsed Cc: David S. Miller <davem@davemloft.net> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robert Richter <robert.richter@amd.com> LKML-Reference: <4ce257b5.1e07e30a.7b6b.3aa9@mx.google.com> Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/builtin-stat.c')
-rw-r--r--tools/perf/builtin-stat.c169
1 files changed, 144 insertions, 25 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a6b4d44f950..b3e568ffad2 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -75,6 +75,7 @@ static int run_idx = 0;
75static int run_count = 1; 75static int run_count = 1;
76static bool no_inherit = false; 76static bool no_inherit = false;
77static bool scale = true; 77static bool scale = true;
78static bool no_aggr = false;
78static pid_t target_pid = -1; 79static pid_t target_pid = -1;
79static pid_t target_tid = -1; 80static pid_t target_tid = -1;
80static pid_t *all_tids = NULL; 81static pid_t *all_tids = NULL;
@@ -89,6 +90,12 @@ static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
89 90
90static int event_scaled[MAX_COUNTERS]; 91static int event_scaled[MAX_COUNTERS];
91 92
93static struct {
94 u64 val;
95 u64 ena;
96 u64 run;
97} cpu_counts[MAX_NR_CPUS][MAX_COUNTERS];
98
92static volatile int done = 0; 99static volatile int done = 0;
93 100
94struct stats 101struct stats
@@ -136,10 +143,10 @@ static double stddev_stats(struct stats *stats)
136} 143}
137 144
138struct stats event_res_stats[MAX_COUNTERS][3]; 145struct stats event_res_stats[MAX_COUNTERS][3];
139struct stats runtime_nsecs_stats; 146struct stats runtime_nsecs_stats[MAX_NR_CPUS];
147struct stats runtime_cycles_stats[MAX_NR_CPUS];
148struct stats runtime_branches_stats[MAX_NR_CPUS];
140struct stats walltime_nsecs_stats; 149struct stats walltime_nsecs_stats;
141struct stats runtime_cycles_stats;
142struct stats runtime_branches_stats;
143 150
144#define MATCH_EVENT(t, c, counter) \ 151#define MATCH_EVENT(t, c, counter) \
145 (attrs[counter].type == PERF_TYPE_##t && \ 152 (attrs[counter].type == PERF_TYPE_##t && \
@@ -205,8 +212,9 @@ static inline int nsec_counter(int counter)
205 212
206/* 213/*
207 * Read out the results of a single counter: 214 * Read out the results of a single counter:
215 * aggregate counts across CPUs in system-wide mode
208 */ 216 */
209static void read_counter(int counter) 217static void read_counter_aggr(int counter)
210{ 218{
211 u64 count[3], single_count[3]; 219 u64 count[3], single_count[3];
212 int cpu; 220 int cpu;
@@ -264,11 +272,58 @@ static void read_counter(int counter)
264 * Save the full runtime - to allow normalization during printout: 272 * Save the full runtime - to allow normalization during printout:
265 */ 273 */
266 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) 274 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
267 update_stats(&runtime_nsecs_stats, count[0]); 275 update_stats(&runtime_nsecs_stats[0], count[0]);
268 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) 276 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
269 update_stats(&runtime_cycles_stats, count[0]); 277 update_stats(&runtime_cycles_stats[0], count[0]);
270 if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) 278 if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
271 update_stats(&runtime_branches_stats, count[0]); 279 update_stats(&runtime_branches_stats[0], count[0]);
280}
281
282/*
283 * Read out the results of a single counter:
284 * do not aggregate counts across CPUs in system-wide mode
285 */
286static void read_counter(int counter)
287{
288 u64 count[3];
289 int cpu;
290 size_t res, nv;
291
292 count[0] = count[1] = count[2] = 0;
293
294 nv = scale ? 3 : 1;
295
296 for (cpu = 0; cpu < nr_cpus; cpu++) {
297
298 if (fd[cpu][counter][0] < 0)
299 continue;
300
301 res = read(fd[cpu][counter][0], count, nv * sizeof(u64));
302
303 assert(res == nv * sizeof(u64));
304
305 close(fd[cpu][counter][0]);
306 fd[cpu][counter][0] = -1;
307
308 if (scale) {
309 if (count[2] == 0) {
310 count[0] = 0;
311 } else if (count[2] < count[1]) {
312 count[0] = (unsigned long long)
313 ((double)count[0] * count[1] / count[2] + 0.5);
314 }
315 }
316 cpu_counts[cpu][counter].val = count[0]; /* scaled count */
317 cpu_counts[cpu][counter].ena = count[1];
318 cpu_counts[cpu][counter].run = count[2];
319
320 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
321 update_stats(&runtime_nsecs_stats[cpu], count[0]);
322 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
323 update_stats(&runtime_cycles_stats[cpu], count[0]);
324 if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
325 update_stats(&runtime_branches_stats[cpu], count[0]);
326 }
272} 327}
273 328
274static int run_perf_stat(int argc __used, const char **argv) 329static int run_perf_stat(int argc __used, const char **argv)
@@ -362,9 +417,13 @@ static int run_perf_stat(int argc __used, const char **argv)
362 417
363 update_stats(&walltime_nsecs_stats, t1 - t0); 418 update_stats(&walltime_nsecs_stats, t1 - t0);
364 419
365 for (counter = 0; counter < nr_counters; counter++) 420 if (no_aggr) {
366 read_counter(counter); 421 for (counter = 0; counter < nr_counters; counter++)
367 422 read_counter(counter);
423 } else {
424 for (counter = 0; counter < nr_counters; counter++)
425 read_counter_aggr(counter);
426 }
368 return WEXITSTATUS(status); 427 return WEXITSTATUS(status);
369} 428}
370 429
@@ -377,11 +436,15 @@ static void print_noise(int counter, double avg)
377 100 * stddev_stats(&event_res_stats[counter][0]) / avg); 436 100 * stddev_stats(&event_res_stats[counter][0]) / avg);
378} 437}
379 438
380static void nsec_printout(int counter, double avg) 439static void nsec_printout(int cpu, int counter, double avg)
381{ 440{
382 double msecs = avg / 1e6; 441 double msecs = avg / 1e6;
383 442
384 fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); 443 if (no_aggr)
444 fprintf(stderr, "CPU%-4d %18.6f %-24s",
445 cpumap[cpu], msecs, event_name(counter));
446 else
447 fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter));
385 448
386 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { 449 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
387 fprintf(stderr, " # %10.3f CPUs ", 450 fprintf(stderr, " # %10.3f CPUs ",
@@ -389,33 +452,41 @@ static void nsec_printout(int counter, double avg)
389 } 452 }
390} 453}
391 454
392static void abs_printout(int counter, double avg) 455static void abs_printout(int cpu, int counter, double avg)
393{ 456{
394 double total, ratio = 0.0; 457 double total, ratio = 0.0;
458 char cpustr[16] = { '\0', };
459
460 if (no_aggr)
461 sprintf(cpustr, "CPU%-4d", cpumap[cpu]);
462 else
463 cpu = 0;
395 464
396 if (big_num) 465 if (big_num)
397 fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter)); 466 fprintf(stderr, "%s %'18.0f %-24s",
467 cpustr, avg, event_name(counter));
398 else 468 else
399 fprintf(stderr, " %18.0f %-24s", avg, event_name(counter)); 469 fprintf(stderr, "%s %18.0f %-24s",
470 cpustr, avg, event_name(counter));
400 471
401 if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { 472 if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
402 total = avg_stats(&runtime_cycles_stats); 473 total = avg_stats(&runtime_cycles_stats[cpu]);
403 474
404 if (total) 475 if (total)
405 ratio = avg / total; 476 ratio = avg / total;
406 477
407 fprintf(stderr, " # %10.3f IPC ", ratio); 478 fprintf(stderr, " # %10.3f IPC ", ratio);
408 } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && 479 } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
409 runtime_branches_stats.n != 0) { 480 runtime_branches_stats[cpu].n != 0) {
410 total = avg_stats(&runtime_branches_stats); 481 total = avg_stats(&runtime_branches_stats[cpu]);
411 482
412 if (total) 483 if (total)
413 ratio = avg * 100 / total; 484 ratio = avg * 100 / total;
414 485
415 fprintf(stderr, " # %10.3f %% ", ratio); 486 fprintf(stderr, " # %10.3f %% ", ratio);
416 487
417 } else if (runtime_nsecs_stats.n != 0) { 488 } else if (runtime_nsecs_stats[cpu].n != 0) {
418 total = avg_stats(&runtime_nsecs_stats); 489 total = avg_stats(&runtime_nsecs_stats[cpu]);
419 490
420 if (total) 491 if (total)
421 ratio = 1000.0 * avg / total; 492 ratio = 1000.0 * avg / total;
@@ -426,8 +497,9 @@ static void abs_printout(int counter, double avg)
426 497
427/* 498/*
428 * Print out the results of a single counter: 499 * Print out the results of a single counter:
500 * aggregated counts in system-wide mode
429 */ 501 */
430static void print_counter(int counter) 502static void print_counter_aggr(int counter)
431{ 503{
432 double avg = avg_stats(&event_res_stats[counter][0]); 504 double avg = avg_stats(&event_res_stats[counter][0]);
433 int scaled = event_scaled[counter]; 505 int scaled = event_scaled[counter];
@@ -439,9 +511,9 @@ static void print_counter(int counter)
439 } 511 }
440 512
441 if (nsec_counter(counter)) 513 if (nsec_counter(counter))
442 nsec_printout(counter, avg); 514 nsec_printout(-1, counter, avg);
443 else 515 else
444 abs_printout(counter, avg); 516 abs_printout(-1, counter, avg);
445 517
446 print_noise(counter, avg); 518 print_noise(counter, avg);
447 519
@@ -458,6 +530,42 @@ static void print_counter(int counter)
458 fprintf(stderr, "\n"); 530 fprintf(stderr, "\n");
459} 531}
460 532
533/*
534 * Print out the results of a single counter:
535 * does not use aggregated count in system-wide
536 */
537static void print_counter(int counter)
538{
539 u64 ena, run, val;
540 int cpu;
541
542 for (cpu = 0; cpu < nr_cpus; cpu++) {
543 val = cpu_counts[cpu][counter].val;
544 ena = cpu_counts[cpu][counter].ena;
545 run = cpu_counts[cpu][counter].run;
546 if (run == 0 || ena == 0) {
547 fprintf(stderr, "CPU%-4d %18s %-24s", cpumap[cpu],
548 "<not counted>", event_name(counter));
549
550 fprintf(stderr, "\n");
551 continue;
552 }
553
554 if (nsec_counter(counter))
555 nsec_printout(cpu, counter, val);
556 else
557 abs_printout(cpu, counter, val);
558
559 print_noise(counter, 1.0);
560
561 if (run != ena) {
562 fprintf(stderr, " (scaled from %.2f%%)",
563 100.0 * run / ena);
564 }
565 fprintf(stderr, "\n");
566 }
567}
568
461static void print_stat(int argc, const char **argv) 569static void print_stat(int argc, const char **argv)
462{ 570{
463 int i, counter; 571 int i, counter;
@@ -480,8 +588,13 @@ static void print_stat(int argc, const char **argv)
480 fprintf(stderr, " (%d runs)", run_count); 588 fprintf(stderr, " (%d runs)", run_count);
481 fprintf(stderr, ":\n\n"); 589 fprintf(stderr, ":\n\n");
482 590
483 for (counter = 0; counter < nr_counters; counter++) 591 if (no_aggr) {
484 print_counter(counter); 592 for (counter = 0; counter < nr_counters; counter++)
593 print_counter(counter);
594 } else {
595 for (counter = 0; counter < nr_counters; counter++)
596 print_counter_aggr(counter);
597 }
485 598
486 fprintf(stderr, "\n"); 599 fprintf(stderr, "\n");
487 fprintf(stderr, " %18.9f seconds time elapsed", 600 fprintf(stderr, " %18.9f seconds time elapsed",
@@ -545,6 +658,8 @@ static const struct option options[] = {
545 "print large numbers with thousands\' separators"), 658 "print large numbers with thousands\' separators"),
546 OPT_STRING('C', "cpu", &cpu_list, "cpu", 659 OPT_STRING('C', "cpu", &cpu_list, "cpu",
547 "list of cpus to monitor in system-wide"), 660 "list of cpus to monitor in system-wide"),
661 OPT_BOOLEAN('A', "no-aggr", &no_aggr,
662 "disable CPU count aggregation"),
548 OPT_END() 663 OPT_END()
549}; 664};
550 665
@@ -562,6 +677,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
562 if (run_count <= 0) 677 if (run_count <= 0)
563 usage_with_options(stat_usage, options); 678 usage_with_options(stat_usage, options);
564 679
680 /* no_aggr is for system-wide only */
681 if (no_aggr && !system_wide)
682 usage_with_options(stat_usage, options);
683
565 /* Set attrs and nr_counters if no event is selected and !null_run */ 684 /* Set attrs and nr_counters if no event is selected and !null_run */
566 if (!null_run && !nr_counters) { 685 if (!null_run && !nr_counters) {
567 memcpy(attrs, default_attrs, sizeof(default_attrs)); 686 memcpy(attrs, default_attrs, sizeof(default_attrs));