aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Andi Kleen <ak@linux.intel.com> 2016-05-24 15:52:37 -0400
committer Arnaldo Carvalho de Melo <acme@redhat.com> 2016-06-06 16:04:16 -0400
commit 239bd47f8355eb5defc865cf408824b6cfeca5dc (patch)
tree 6786c6f67d36700c3757574b864c0549a36c5670
parent 44b1e60ab576c343aa592a2a6c679297cc69740d (diff)
perf stat: Add computation of TopDown formulas
Implement the TopDown formulas in 'perf stat'. The topdown basic metrics
reported by the kernel are collected, and the formulas are computed and
output as normal metrics.

See the kernel commit exporting the events for details on the used
metrics.

Committer note:

Output example:

  # perf stat --topdown -a usleep 1

   Performance counter stats for 'system wide':

                retiring   bad speculation   frontend bound   backend bound
  S0-C0    2       23.8%             11.6%            28.3%           36.3%
  S0-C1    2       16.2%             15.7%            36.5%           31.6%

         0.000579956 seconds time elapsed
  #

v2: Always print all metrics, only use thresholds for coloring.
v3: Mark retiring over threshold green, not red.
v4: Only print one decimal digit
    Fix color printing of one metric
v5: Avoid printing -0.0
v6: Remove extra frontend event lookup

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/1464119559-17203-2-git-send-email-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
-rw-r--r-- tools/perf/util/stat-shadow.c 162
-rw-r--r-- tools/perf/util/stat.c 5
-rw-r--r-- tools/perf/util/stat.h 5
3 files changed, 172 insertions, 0 deletions
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index aa9efe08762b..8a2bbd2a4d82 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -36,6 +36,11 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
36static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS]; 36static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
37static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS]; 37static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
38static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS]; 38static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
/*
 * Running statistics for the five raw TopDown events, one struct stats per
 * [ctx][cpu] pair. They are filled in by perf_stat__update_shadow_stats(),
 * zeroed by perf_stat__reset_shadow_stats(), and read by the td_*() helpers
 * that compute the TopDown ratios.
 */
39static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
40static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
41static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
42static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
43static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
39static bool have_frontend_stalled; 44static bool have_frontend_stalled;
40 45
41struct stats walltime_nsecs_stats; 46struct stats walltime_nsecs_stats;
@@ -82,6 +87,11 @@ void perf_stat__reset_shadow_stats(void)
82 sizeof(runtime_transaction_stats)); 87 sizeof(runtime_transaction_stats));
83 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats)); 88 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
84 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 89 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
90 memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
91 memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
92 memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
93 memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
94 memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
85} 95}
86 96
87/* 97/*
@@ -105,6 +115,16 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
105 update_stats(&runtime_transaction_stats[ctx][cpu], count[0]); 115 update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
106 else if (perf_stat_evsel__is(counter, ELISION_START)) 116 else if (perf_stat_evsel__is(counter, ELISION_START))
107 update_stats(&runtime_elision_stats[ctx][cpu], count[0]); 117 update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
118 else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
119 update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
120 else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
121 update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
122 else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
123 update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
124 else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
125 update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu],count[0]);
126 else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
127 update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
108 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 128 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
109 update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]); 129 update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
110 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 130 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -302,6 +322,107 @@ static void print_ll_cache_misses(int cpu,
302 out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio); 322 out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
303} 323}
304 324
325/*
326 * High level "TopDown" CPU core pipeline bottleneck breakdown.
327 *
328 * Basic concept following
329 * Yasin, "A Top-Down Method for Performance Analysis and Counters Architecture",
330 * ISPASS 2014
331 *
332 * The CPU pipeline is divided into 4 areas that can be bottlenecks:
333 *
334 * Frontend -> Backend -> Retiring
335 * BadSpeculation in addition means out of order execution that is thrown away
336 * (for example branch mispredictions)
337 * Frontend is instruction decoding.
338 * Backend is execution, like computation and accessing data in memory
339 * Retiring is good execution that is not directly bottlenecked
340 *
341 * The formulas are computed in slots.
342 * A slot is one pipeline entry; each cycle has one slot per unit of pipeline width
343 * (for example a 4-wide pipeline has 4 slots for each cycle)
344 *
345 * Formulas:
346 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
347 * TotalSlots
348 * Retiring = SlotsRetired / TotalSlots
349 * FrontendBound = FetchBubbles / TotalSlots
350 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
351 *
352 * The kernel provides the mapping to the low level CPU events and any scaling
353 * needed for the CPU pipeline width, for example:
354 *
355 * TotalSlots = Cycles * 4
356 *
357 * The scaling factor is communicated in the sysfs unit.
358 *
359 * In some cases the CPU may not be able to measure all the formulas due to
360 * missing events. In this case multiple formulas are combined, as possible.
361 *
362 * Full TopDown supports more levels to sub-divide each area: for example
363 * BackendBound into computing bound and memory bound. For now we only
364 * support Level 1 TopDown.
365 */
366
/*
 * Clamp a slightly negative value to zero.
 *
 * Any x in the range [-0.02, 0) is returned as 0.0; every other value
 * (positive, zero, or more negative than -0.02) is returned unchanged.
 * Applied to computed TopDown ratios; the commit log mentions this is meant
 * to avoid printing "-0.0".
 */
367static double sanitize_val(double x)
368{
369 if (x < 0 && x >= -0.02)
370 return 0.0;
371 return x;
372}
373
/*
 * Return avg_stats() of the topdown-total-slots statistics recorded for the
 * given context/CPU pair. The other td_*() helpers use this value as the
 * denominator of their ratios.
 */
374static double td_total_slots(int ctx, int cpu)
375{
376 return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
377}
378
/*
 * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) / TotalSlots
 *
 * Each term is the avg_stats() value for the given context/CPU pair.
 * Returns 0 when the total slot count is zero; otherwise the ratio is passed
 * through sanitize_val() before being returned.
 */
379static double td_bad_spec(int ctx, int cpu)
380{
381 double bad_spec = 0;
382 double total_slots;
383 double total;
384
385 total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
386 avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
387 avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
388 total_slots = td_total_slots(ctx, cpu);
/* Skip the division when total_slots is 0; bad_spec then stays 0. */
389 if (total_slots)
390 bad_spec = total / total_slots;
391 return sanitize_val(bad_spec);
392}
393
/*
 * Retiring = SlotsRetired / TotalSlots
 *
 * Both terms are avg_stats() values for the given context/CPU pair.
 * Returns 0 when the total slot count is zero. Unlike td_bad_spec(), the
 * result is not passed through sanitize_val().
 */
394static double td_retiring(int ctx, int cpu)
395{
396 double retiring = 0;
397 double total_slots = td_total_slots(ctx, cpu);
398 double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
399
/* Skip the division when total_slots is 0; retiring then stays 0. */
400 if (total_slots)
401 retiring = ret_slots / total_slots;
402 return retiring;
403}
404
/*
 * FrontendBound = FetchBubbles / TotalSlots
 *
 * Both terms are avg_stats() values for the given context/CPU pair.
 * Returns 0 when the total slot count is zero. The result is not passed
 * through sanitize_val().
 */
405static double td_fe_bound(int ctx, int cpu)
406{
407 double fe_bound = 0;
408 double total_slots = td_total_slots(ctx, cpu);
409 double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
410
/* Skip the division when total_slots is 0; fe_bound then stays 0. */
411 if (total_slots)
412 fe_bound = fetch_bub / total_slots;
413 return fe_bound;
414}
415
/*
 * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 *
 * Computed as the remainder after the three other ratios, then passed through
 * sanitize_val(). If the three ratios sum to exactly 0 the function returns 0
 * instead of 1.0.
 */
416static double td_be_bound(int ctx, int cpu)
417{
418 double sum = (td_fe_bound(ctx, cpu) +
419 td_bad_spec(ctx, cpu) +
420 td_retiring(ctx, cpu));
/*
 * NOTE(review): a zero sum presumably means no TopDown data was collected,
 * so 0 is reported rather than 100% backend bound -- confirm.
 */
421 if (sum == 0)
422 return 0;
423 return sanitize_val(1.0 - sum);
424}
425
305void perf_stat__print_shadow_stats(struct perf_evsel *evsel, 426void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
306 double avg, int cpu, 427 double avg, int cpu,
307 struct perf_stat_output_ctx *out) 428 struct perf_stat_output_ctx *out)
@@ -309,6 +430,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
309 void *ctxp = out->ctx; 430 void *ctxp = out->ctx;
310 print_metric_t print_metric = out->print_metric; 431 print_metric_t print_metric = out->print_metric;
311 double total, ratio = 0.0, total2; 432 double total, ratio = 0.0, total2;
433 const char *color = NULL;
312 int ctx = evsel_context(evsel); 434 int ctx = evsel_context(evsel);
313 435
314 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 436 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
@@ -452,6 +574,46 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
452 avg / ratio); 574 avg / ratio);
453 else 575 else
454 print_metric(ctxp, NULL, NULL, "CPUs utilized", 0); 576 print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
577 } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
578 double fe_bound = td_fe_bound(ctx, cpu);
579
580 if (fe_bound > 0.2)
581 color = PERF_COLOR_RED;
582 print_metric(ctxp, color, "%8.1f%%", "frontend bound",
583 fe_bound * 100.);
584 } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
585 double retiring = td_retiring(ctx, cpu);
586
587 if (retiring > 0.7)
588 color = PERF_COLOR_GREEN;
589 print_metric(ctxp, color, "%8.1f%%", "retiring",
590 retiring * 100.);
591 } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
592 double bad_spec = td_bad_spec(ctx, cpu);
593
594 if (bad_spec > 0.1)
595 color = PERF_COLOR_RED;
596 print_metric(ctxp, color, "%8.1f%%", "bad speculation",
597 bad_spec * 100.);
598 } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
599 double be_bound = td_be_bound(ctx, cpu);
600 const char *name = "backend bound";
601 static int have_recovery_bubbles = -1;
602
603 /* In case the CPU does not support topdown-recovery-bubbles */
604 if (have_recovery_bubbles < 0)
605 have_recovery_bubbles = pmu_have_event("cpu",
606 "topdown-recovery-bubbles");
607 if (!have_recovery_bubbles)
608 name = "backend bound/bad spec";
609
610 if (be_bound > 0.2)
611 color = PERF_COLOR_RED;
612 if (td_total_slots(ctx, cpu) > 0)
613 print_metric(ctxp, color, "%8.1f%%", name,
614 be_bound * 100.);
615 else
616 print_metric(ctxp, NULL, NULL, name, 0);
455 } else if (runtime_nsecs_stats[cpu].n != 0) { 617 } else if (runtime_nsecs_stats[cpu].n != 0) {
456 char unit = 'M'; 618 char unit = 'M';
457 char unit_buf[10]; 619 char unit_buf[10];
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index ffa1d0653861..c1ba255f2abe 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -79,6 +79,11 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = {
79 ID(TRANSACTION_START, cpu/tx-start/), 79 ID(TRANSACTION_START, cpu/tx-start/),
80 ID(ELISION_START, cpu/el-start/), 80 ID(ELISION_START, cpu/el-start/),
81 ID(CYCLES_IN_TX_CP, cpu/cycles-ct/), 81 ID(CYCLES_IN_TX_CP, cpu/cycles-ct/),
82 ID(TOPDOWN_TOTAL_SLOTS, topdown-total-slots),
83 ID(TOPDOWN_SLOTS_ISSUED, topdown-slots-issued),
84 ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
85 ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
86 ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
82}; 87};
83#undef ID 88#undef ID
84 89
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0150e786ccc7..c29bb94c48a4 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -17,6 +17,11 @@ enum perf_stat_evsel_id {
17 PERF_STAT_EVSEL_ID__TRANSACTION_START, 17 PERF_STAT_EVSEL_ID__TRANSACTION_START,
18 PERF_STAT_EVSEL_ID__ELISION_START, 18 PERF_STAT_EVSEL_ID__ELISION_START,
19 PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP, 19 PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP,
20 PERF_STAT_EVSEL_ID__TOPDOWN_TOTAL_SLOTS,
21 PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_ISSUED,
22 PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
23 PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
24 PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
20 PERF_STAT_EVSEL_ID__MAX, 25 PERF_STAT_EVSEL_ID__MAX,
21}; 26};
22 27