aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/builtin-script.c
diff options
context:
space:
mode:
authorAndi Kleen <ak@linux.intel.com>2017-02-23 18:46:34 -0500
committerArnaldo Carvalho de Melo <acme@redhat.com>2017-03-16 08:24:35 -0400
commit48d02a1d5c137d362defd11a5d57d0af4a75a983 (patch)
tree8081aba27b3fc1346479bae1b118040d45f8772b /tools/perf/builtin-script.c
parent74beb09a80ca248ba9d54dede80aebc2557ccb30 (diff)
perf script: Add 'brstackinsn' for branch stacks
Implement printing instruction sequences as hex dump for branch stacks.

This relies on the x86 instruction decoder used by the PT decoder to find the lengths of instructions to dump them individually.

This is good enough for pattern matching.

This makes it possible to study hot paths for individual samples, together with branch misprediction and cycle count / IPC information if available (on Skylake systems).

  % perf record -b ...
  % perf script -F brstackinsn
  ...
      read_hpet+67:
      ffffffff9905b843 insn: 74 ea # PRED
      ffffffff9905b82f insn: 85 c9
      ffffffff9905b831 insn: 74 12
      ffffffff9905b833 insn: f3 90
      ffffffff9905b835 insn: 48 8b 0f
      ffffffff9905b838 insn: 48 89 ca
      ffffffff9905b83b insn: 48 c1 ea 20
      ffffffff9905b83f insn: 39 f2
      ffffffff9905b841 insn: 89 d0
      ffffffff9905b843 insn: 74 ea # PRED

Only works when no special branch filters are specified.

Occasionally the path does not reach up to the sample IP, as the LBRs may be frozen before executing a final jump. In this case we print a special message.

The instruction dumper piggybacks on the existing infrastructure from the Intel PT decoder.

An earlier iteration of this patch relied on a disassembler, but this version only uses the existing instruction decoder.

Committer note:

Added hint about how to get suitable perf.data files for use with '-F brstackinsn':

  $ perf record usleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.018 MB perf.data (8 samples) ]
  $
  $ perf script -F brstackinsn
  Display of branch stack assembler requested, but non all-branch filter set
  Hint: run 'perf record -b ...'
  $

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: http://lkml.kernel.org/r/20170223234634.583-1-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/builtin-script.c')
-rw-r--r--tools/perf/builtin-script.c264
1 files changed, 255 insertions, 9 deletions
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 66d62c98dff9..c98e16689b57 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -28,6 +28,7 @@
28#include <linux/time64.h> 28#include <linux/time64.h>
29#include "asm/bug.h" 29#include "asm/bug.h"
30#include "util/mem-events.h" 30#include "util/mem-events.h"
31#include "util/dump-insn.h"
31 32
32static char const *script_name; 33static char const *script_name;
33static char const *generate_script_lang; 34static char const *generate_script_lang;
@@ -42,6 +43,7 @@ static bool nanosecs;
42static const char *cpu_list; 43static const char *cpu_list;
43static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); 44static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
44static struct perf_stat_config stat_config; 45static struct perf_stat_config stat_config;
46static int max_blocks;
45 47
46unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH; 48unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
47 49
@@ -69,6 +71,7 @@ enum perf_output_field {
69 PERF_OUTPUT_CALLINDENT = 1U << 20, 71 PERF_OUTPUT_CALLINDENT = 1U << 20,
70 PERF_OUTPUT_INSN = 1U << 21, 72 PERF_OUTPUT_INSN = 1U << 21,
71 PERF_OUTPUT_INSNLEN = 1U << 22, 73 PERF_OUTPUT_INSNLEN = 1U << 22,
74 PERF_OUTPUT_BRSTACKINSN = 1U << 23,
72}; 75};
73 76
74struct output_option { 77struct output_option {
@@ -98,6 +101,7 @@ struct output_option {
98 {.str = "callindent", .field = PERF_OUTPUT_CALLINDENT}, 101 {.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
99 {.str = "insn", .field = PERF_OUTPUT_INSN}, 102 {.str = "insn", .field = PERF_OUTPUT_INSN},
100 {.str = "insnlen", .field = PERF_OUTPUT_INSNLEN}, 103 {.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
104 {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
101}; 105};
102 106
103/* default set to maintain compatibility with current format */ 107/* default set to maintain compatibility with current format */
@@ -292,7 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
292 "selected. Hence, no address to lookup the source line number.\n"); 296 "selected. Hence, no address to lookup the source line number.\n");
293 return -EINVAL; 297 return -EINVAL;
294 } 298 }
295 299 if (PRINT_FIELD(BRSTACKINSN) &&
300 !(perf_evlist__combined_branch_type(session->evlist) &
301 PERF_SAMPLE_BRANCH_ANY)) {
302 pr_err("Display of branch stack assembler requested, but non all-branch filter set\n"
303 "Hint: run 'perf record -b ...'\n");
304 return -EINVAL;
305 }
296 if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && 306 if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
297 perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID", 307 perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
298 PERF_OUTPUT_TID|PERF_OUTPUT_PID)) 308 PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -546,6 +556,233 @@ static void print_sample_brstacksym(struct perf_sample *sample,
546 } 556 }
547} 557}
548 558
559#define MAXBB 16384UL
560
561static int grab_bb(u8 *buffer, u64 start, u64 end,
562 struct machine *machine, struct thread *thread,
563 bool *is64bit, u8 *cpumode, bool last)
564{
565 long offset, len;
566 struct addr_location al;
567 bool kernel;
568
569 if (!start || !end)
570 return 0;
571
572 kernel = machine__kernel_ip(machine, start);
573 if (kernel)
574 *cpumode = PERF_RECORD_MISC_KERNEL;
575 else
576 *cpumode = PERF_RECORD_MISC_USER;
577
578 /*
579 * Block overlaps between kernel and user.
580 * This can happen due to ring filtering
581 * On Intel CPUs the entry into the kernel is filtered,
582 * but the exit is not. Let the caller patch it up.
583 */
584 if (kernel != machine__kernel_ip(machine, end)) {
585 printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n",
586 start, end);
587 return -ENXIO;
588 }
589
590 memset(&al, 0, sizeof(al));
591 if (end - start > MAXBB - MAXINSN) {
592 if (last)
593 printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end);
594 else
595 printf("\tblock %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n", start, end, end - start);
596 return 0;
597 }
598
599 thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
600 if (!al.map || !al.map->dso) {
601 printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
602 return 0;
603 }
604 if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
605 printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
606 return 0;
607 }
608
609 /* Load maps to ensure dso->is_64_bit has been updated */
610 map__load(al.map);
611
612 offset = al.map->map_ip(al.map, start);
613 len = dso__data_read_offset(al.map->dso, machine, offset, (u8 *)buffer,
614 end - start + MAXINSN);
615
616 *is64bit = al.map->dso->is_64_bit;
617 if (len <= 0)
618 printf("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n",
619 start, end);
620 return len;
621}
622
623static void print_jump(uint64_t ip, struct branch_entry *en,
624 struct perf_insn *x, u8 *inbuf, int len,
625 int insn)
626{
627 printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
628 ip,
629 dump_insn(x, ip, inbuf, len, NULL),
630 en->flags.predicted ? " PRED" : "",
631 en->flags.mispred ? " MISPRED" : "",
632 en->flags.in_tx ? " INTX" : "",
633 en->flags.abort ? " ABORT" : "");
634 if (en->flags.cycles) {
635 printf(" %d cycles", en->flags.cycles);
636 if (insn)
637 printf(" %.2f IPC", (float)insn / en->flags.cycles);
638 }
639 putchar('\n');
640}
641
642static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu,
643 uint64_t addr, struct symbol **lastsym,
644 struct perf_event_attr *attr)
645{
646 struct addr_location al;
647 int off;
648
649 memset(&al, 0, sizeof(al));
650
651 thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
652 if (!al.map)
653 thread__find_addr_map(thread, cpumode, MAP__VARIABLE,
654 addr, &al);
655 if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end)
656 return;
657
658 al.cpu = cpu;
659 al.sym = NULL;
660 if (al.map)
661 al.sym = map__find_symbol(al.map, al.addr);
662
663 if (!al.sym)
664 return;
665
666 if (al.addr < al.sym->end)
667 off = al.addr - al.sym->start;
668 else
669 off = al.addr - al.map->start - al.sym->start;
670 printf("\t%s", al.sym->name);
671 if (off)
672 printf("%+d", off);
673 putchar(':');
674 if (PRINT_FIELD(SRCLINE))
675 map__fprintf_srcline(al.map, al.addr, "\t", stdout);
676 putchar('\n');
677 *lastsym = al.sym;
678}
679
680static void print_sample_brstackinsn(struct perf_sample *sample,
681 struct thread *thread,
682 struct perf_event_attr *attr,
683 struct machine *machine)
684{
685 struct branch_stack *br = sample->branch_stack;
686 u64 start, end;
687 int i, insn, len, nr, ilen;
688 struct perf_insn x;
689 u8 buffer[MAXBB];
690 unsigned off;
691 struct symbol *lastsym = NULL;
692
693 if (!(br && br->nr))
694 return;
695 nr = br->nr;
696 if (max_blocks && nr > max_blocks + 1)
697 nr = max_blocks + 1;
698
699 x.thread = thread;
700 x.cpu = sample->cpu;
701
702 putchar('\n');
703
704 /* Handle first from jump, of which we don't know the entry. */
705 len = grab_bb(buffer, br->entries[nr-1].from,
706 br->entries[nr-1].from,
707 machine, thread, &x.is64bit, &x.cpumode, false);
708 if (len > 0) {
709 print_ip_sym(thread, x.cpumode, x.cpu,
710 br->entries[nr - 1].from, &lastsym, attr);
711 print_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
712 &x, buffer, len, 0);
713 }
714
715 /* Print all blocks */
716 for (i = nr - 2; i >= 0; i--) {
717 if (br->entries[i].from || br->entries[i].to)
718 pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
719 br->entries[i].from,
720 br->entries[i].to);
721 start = br->entries[i + 1].to;
722 end = br->entries[i].from;
723
724 len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
725 /* Patch up missing kernel transfers due to ring filters */
726 if (len == -ENXIO && i > 0) {
727 end = br->entries[--i].from;
728 pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end);
729 len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
730 }
731 if (len <= 0)
732 continue;
733
734 insn = 0;
735 for (off = 0;; off += ilen) {
736 uint64_t ip = start + off;
737
738 print_ip_sym(thread, x.cpumode, x.cpu, ip, &lastsym, attr);
739 if (ip == end) {
740 print_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn);
741 break;
742 } else {
743 printf("\t%016" PRIx64 "\t%s\n", ip,
744 dump_insn(&x, ip, buffer + off, len - off, &ilen));
745 if (ilen == 0)
746 break;
747 insn++;
748 }
749 }
750 }
751
752 /*
753 * Hit the branch? In this case we are already done, and the target
754 * has not been executed yet.
755 */
756 if (br->entries[0].from == sample->ip)
757 return;
758 if (br->entries[0].flags.abort)
759 return;
760
761 /*
762 * Print final block upto sample
763 */
764 start = br->entries[0].to;
765 end = sample->ip;
766 len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, true);
767 print_ip_sym(thread, x.cpumode, x.cpu, start, &lastsym, attr);
768 if (len <= 0) {
769 /* Print at least last IP if basic block did not work */
770 len = grab_bb(buffer, sample->ip, sample->ip,
771 machine, thread, &x.is64bit, &x.cpumode, false);
772 if (len <= 0)
773 return;
774
775 printf("\t%016" PRIx64 "\t%s\n", sample->ip,
776 dump_insn(&x, sample->ip, buffer, len, NULL));
777 return;
778 }
779 for (off = 0; off <= end - start; off += ilen) {
780 printf("\t%016" PRIx64 "\t%s\n", start + off,
781 dump_insn(&x, start + off, buffer + off, len - off, &ilen));
782 if (ilen == 0)
783 break;
784 }
785}
549 786
550static void print_sample_addr(struct perf_sample *sample, 787static void print_sample_addr(struct perf_sample *sample,
551 struct thread *thread, 788 struct thread *thread,
@@ -632,7 +869,9 @@ static void print_sample_callindent(struct perf_sample *sample,
632} 869}
633 870
634static void print_insn(struct perf_sample *sample, 871static void print_insn(struct perf_sample *sample,
635 struct perf_event_attr *attr) 872 struct perf_event_attr *attr,
873 struct thread *thread,
874 struct machine *machine)
636{ 875{
637 if (PRINT_FIELD(INSNLEN)) 876 if (PRINT_FIELD(INSNLEN))
638 printf(" ilen: %d", sample->insn_len); 877 printf(" ilen: %d", sample->insn_len);
@@ -643,12 +882,15 @@ static void print_insn(struct perf_sample *sample,
643 for (i = 0; i < sample->insn_len; i++) 882 for (i = 0; i < sample->insn_len; i++)
644 printf(" %02x", (unsigned char)sample->insn[i]); 883 printf(" %02x", (unsigned char)sample->insn[i]);
645 } 884 }
885 if (PRINT_FIELD(BRSTACKINSN))
886 print_sample_brstackinsn(sample, thread, attr, machine);
646} 887}
647 888
648static void print_sample_bts(struct perf_sample *sample, 889static void print_sample_bts(struct perf_sample *sample,
649 struct perf_evsel *evsel, 890 struct perf_evsel *evsel,
650 struct thread *thread, 891 struct thread *thread,
651 struct addr_location *al) 892 struct addr_location *al,
893 struct machine *machine)
652{ 894{
653 struct perf_event_attr *attr = &evsel->attr; 895 struct perf_event_attr *attr = &evsel->attr;
654 bool print_srcline_last = false; 896 bool print_srcline_last = false;
@@ -689,7 +931,7 @@ static void print_sample_bts(struct perf_sample *sample,
689 if (print_srcline_last) 931 if (print_srcline_last)
690 map__fprintf_srcline(al->map, al->addr, "\n ", stdout); 932 map__fprintf_srcline(al->map, al->addr, "\n ", stdout);
691 933
692 print_insn(sample, attr); 934 print_insn(sample, attr, thread, machine);
693 935
694 printf("\n"); 936 printf("\n");
695} 937}
@@ -872,7 +1114,8 @@ static size_t data_src__printf(u64 data_src)
872 1114
873static void process_event(struct perf_script *script, 1115static void process_event(struct perf_script *script,
874 struct perf_sample *sample, struct perf_evsel *evsel, 1116 struct perf_sample *sample, struct perf_evsel *evsel,
875 struct addr_location *al) 1117 struct addr_location *al,
1118 struct machine *machine)
876{ 1119{
877 struct thread *thread = al->thread; 1120 struct thread *thread = al->thread;
878 struct perf_event_attr *attr = &evsel->attr; 1121 struct perf_event_attr *attr = &evsel->attr;
@@ -899,7 +1142,7 @@ static void process_event(struct perf_script *script,
899 print_sample_flags(sample->flags); 1142 print_sample_flags(sample->flags);
900 1143
901 if (is_bts_event(attr)) { 1144 if (is_bts_event(attr)) {
902 print_sample_bts(sample, evsel, thread, al); 1145 print_sample_bts(sample, evsel, thread, al, machine);
903 return; 1146 return;
904 } 1147 }
905 1148
@@ -937,7 +1180,7 @@ static void process_event(struct perf_script *script,
937 1180
938 if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) 1181 if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
939 print_sample_bpf_output(sample); 1182 print_sample_bpf_output(sample);
940 print_insn(sample, attr); 1183 print_insn(sample, attr, thread, machine);
941 printf("\n"); 1184 printf("\n");
942} 1185}
943 1186
@@ -1047,7 +1290,7 @@ static int process_sample_event(struct perf_tool *tool,
1047 if (scripting_ops) 1290 if (scripting_ops)
1048 scripting_ops->process_event(event, sample, evsel, &al); 1291 scripting_ops->process_event(event, sample, evsel, &al);
1049 else 1292 else
1050 process_event(scr, sample, evsel, &al); 1293 process_event(scr, sample, evsel, &al, machine);
1051 1294
1052out_put: 1295out_put:
1053 addr_location__put(&al); 1296 addr_location__put(&al);
@@ -2191,7 +2434,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
2191 "Valid types: hw,sw,trace,raw. " 2434 "Valid types: hw,sw,trace,raw. "
2192 "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," 2435 "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
2193 "addr,symoff,period,iregs,brstack,brstacksym,flags," 2436 "addr,symoff,period,iregs,brstack,brstacksym,flags,"
2194 "bpf-output,callindent,insn,insnlen", parse_output_fields), 2437 "bpf-output,callindent,insn,insnlen,brstackinsn",
2438 parse_output_fields),
2195 OPT_BOOLEAN('a', "all-cpus", &system_wide, 2439 OPT_BOOLEAN('a', "all-cpus", &system_wide,
2196 "system-wide collection from all CPUs"), 2440 "system-wide collection from all CPUs"),
2197 OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", 2441 OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -2222,6 +2466,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
2222 OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events, 2466 OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events,
2223 "Show namespace events (if recorded)"), 2467 "Show namespace events (if recorded)"),
2224 OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"), 2468 OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
2469 OPT_INTEGER(0, "max-blocks", &max_blocks,
2470 "Maximum number of code blocks to dump with brstackinsn"),
2225 OPT_BOOLEAN(0, "ns", &nanosecs, 2471 OPT_BOOLEAN(0, "ns", &nanosecs,
2226 "Use 9 decimal places when displaying time"), 2472 "Use 9 decimal places when displaying time"),
2227 OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", 2473 OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",