aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/perf_counter/kerneltop.c
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/perf_counter/kerneltop.c')
-rw-r--r--Documentation/perf_counter/kerneltop.c1409
1 files changed, 0 insertions, 1409 deletions
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
deleted file mode 100644
index 042c1b83a872..000000000000
--- a/Documentation/perf_counter/kerneltop.c
+++ /dev/null
@@ -1,1409 +0,0 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#define _GNU_SOURCE
65#include <sys/types.h>
66#include <sys/stat.h>
67#include <sys/time.h>
68#include <unistd.h>
69#include <stdint.h>
70#include <stdlib.h>
71#include <string.h>
72#include <limits.h>
73#include <getopt.h>
74#include <assert.h>
75#include <fcntl.h>
76#include <stdio.h>
77#include <errno.h>
78#include <ctype.h>
79#include <time.h>
80#include <sched.h>
81#include <pthread.h>
82
83#include <sys/syscall.h>
84#include <sys/ioctl.h>
85#include <sys/poll.h>
86#include <sys/prctl.h>
87#include <sys/wait.h>
88#include <sys/uio.h>
89#include <sys/mman.h>
90
91#include <linux/unistd.h>
92#include <linux/types.h>
93
94#include "../../include/linux/perf_counter.h"
95
96
/*
 * prctl() option numbers for the per-task performance counter switches.
 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
 * counters in the current task; the ENABLE value is its counterpart.
 */
#define PR_TASK_PERF_COUNTERS_DISABLE	31
#define PR_TASK_PERF_COUNTERS_ENABLE	32

/* Element count of a real array (meaningless when handed a pointer). */
#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))

/*
 * Evaluates to the current CLOCK_MONOTONIC reading expressed in
 * nanoseconds (GNU statement expression).
 */
#define rdclock()						\
({								\
	struct timespec rd_ts;					\
								\
	clock_gettime(CLOCK_MONOTONIC, &rd_ts);			\
	rd_ts.tv_sec * 1000000000ULL + rd_ts.tv_nsec;		\
})
113
/*
 * Pick up some kernel type conventions: these annotations are defined
 * to nothing so kernel-style prototypes can be used in this file.
 */
#define __user
#define asmlinkage

/*
 * Per-architecture glue: the perf_counter_open syscall number plus two
 * inline-asm helpers.  rmb() emits the barrier instruction shown below
 * with a compiler "memory" clobber; cpu_relax() emits the spin-wait
 * hint (an empty asm, i.e. only the compiler clobber, on powerpc).
 *
 * The helpers deliberately do NOT end in ';' so that they behave like
 * ordinary function calls ("if (x) cpu_relax(); else ..." stays valid),
 * and they use the __asm__ spelling so they also build in strict
 * -std=c99/-std=c11 modes where the bare "asm" keyword is disabled.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open	295
#define rmb()		__asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()	__asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open	333
#define rmb()		__asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()	__asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open	319
#define rmb()		__asm__ __volatile__("sync" ::: "memory")
#define cpu_relax()	__asm__ __volatile__("" ::: "memory")
#endif
137
/* Branch-prediction hint: tells the compiler the condition is rarely true. */
#define unlikely(x)	__builtin_expect(!!(x), 0)

/*
 * Type-checked minimum that evaluates each argument exactly once.
 * The pointer comparison exists only to make the compiler complain when
 * x and y have different types.  __typeof__ (rather than typeof) keeps
 * the macro usable in strict ISO modes such as -std=c11.
 */
#define min(x, y) ({				\
	__typeof__(x) _min1 = (x);		\
	__typeof__(y) _min2 = (y);		\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
144
145asmlinkage int sys_perf_counter_open(
146 struct perf_counter_hw_event *hw_event_uptr __user,
147 pid_t pid,
148 int cpu,
149 int group_fd,
150 unsigned long flags)
151{
152 return syscall(
153 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
154}
155
156#define MAX_COUNTERS 64
157#define MAX_NR_CPUS 256
158
159#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
160
161static int run_perfstat = 0;
162static int system_wide = 0;
163
164static int nr_counters = 0;
165static __u64 event_id[MAX_COUNTERS] = {
166 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
167 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
168 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
169 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
170
171 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
172 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
173 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
174 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
175};
176static int default_interval = 100000;
177static int event_count[MAX_COUNTERS];
178static int fd[MAX_NR_CPUS][MAX_COUNTERS];
179
180static __u64 count_filter = 100;
181
182static int tid = -1;
183static int profile_cpu = -1;
184static int nr_cpus = 0;
185static int nmi = 1;
186static unsigned int realtime_prio = 0;
187static int group = 0;
188static unsigned int page_size;
189static unsigned int mmap_pages = 16;
190static int use_mmap = 0;
191static int use_munmap = 0;
192
193static char *vmlinux;
194
195static char *sym_filter;
196static unsigned long filter_start;
197static unsigned long filter_end;
198
199static int delay_secs = 2;
200static int zero;
201static int dump_symtab;
202
203static int scale;
204
/*
 * One line of captured objdump output, kept in a singly linked list.
 */
struct source_line {
	uint64_t		EIP;	/* address parsed from the line, 0 if none */
	unsigned long		count;	/* hits recorded against EIP */
	char			*line;	/* the text of the line */
	struct source_line	*next;
};

/* List head, and the location where the next node gets linked in. */
static struct source_line	*lines;
static struct source_line	**lines_tail;
214
/* Table of default counts (six entries). */
const unsigned int default_count[] = {
	1000000,
	1000000,
	  10000,
	  10000,
	1000000,
	  10000,
};

/* Display names for hardware events; event_name() indexes this by event id. */
static char *hw_event_names[] = {
	"CPU cycles",
	"instructions",
	"cache references",
	"cache misses",
	"branches",
	"branch misses",
	"bus cycles",
};

/* Display names for software events; event_name() indexes this by event id. */
static char *sw_event_names[] = {
	"cpu clock ticks",
	"task clock ticks",
	"pagefaults",
	"context switches",
	"CPU migrations",
	"minor faults",
	"major faults",
};
243
244struct event_symbol {
245 __u64 event;
246 char *symbol;
247};
248
249static struct event_symbol event_symbols[] = {
250 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
251 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
252 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
253 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
254 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
255 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
256 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
257 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
258 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
259
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
261 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
263 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
264 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
265 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
266 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
267 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
268 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
269 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
270};
271
/*
 * Extract one bit-field from a counter config word: mask with
 * PERF_COUNTER_<name>_MASK, then shift down by PERF_COUNTER_<name>_SHIFT.
 * The argument is parenthesized so that an expression such as "a | b"
 * is masked as a whole instead of being split by operator precedence.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
279
280static void display_events_help(void)
281{
282 unsigned int i;
283 __u64 e;
284
285 printf(
286 " -e EVENT --event=EVENT # symbolic-name abbreviations");
287
288 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
289 int type, id;
290
291 e = event_symbols[i].event;
292 type = PERF_COUNTER_TYPE(e);
293 id = PERF_COUNTER_ID(e);
294
295 printf("\n %d:%d: %-20s",
296 type, id, event_symbols[i].symbol);
297 }
298
299 printf("\n"
300 " rNNN: raw PMU events (eventsel+umask)\n\n");
301}
302
303static void display_perfstat_help(void)
304{
305 printf(
306 "Usage: perfstat [<events...>] <cmd...>\n\n"
307 "PerfStat Options (up to %d event types can be specified):\n\n",
308 MAX_COUNTERS);
309
310 display_events_help();
311
312 printf(
313 " -l # scale counter values\n"
314 " -a # system-wide collection\n");
315 exit(0);
316}
317
318static void display_help(void)
319{
320 if (run_perfstat)
321 return display_perfstat_help();
322
323 printf(
324 "Usage: kerneltop [<options>]\n"
325 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
326 "KernelTop Options (up to %d event types can be specified at once):\n\n",
327 MAX_COUNTERS);
328
329 display_events_help();
330
331 printf(
332 " -S --stat # perfstat COMMAND\n"
333 " -a # system-wide collection (for perfstat)\n\n"
334 " -c CNT --count=CNT # event period to sample\n\n"
335 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
336 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
337 " -l # show scale factor for RR events\n"
338 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
339 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
340 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
341 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
342 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
343 " -z --zero # zero counts after display\n"
344 " -D --dump_symtab # dump symbol table to stderr on startup\n"
345 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
346 " -M --mmap_info # print mmap info stream\n"
347 " -U --munmap_info # print munmap info stream\n"
348 );
349
350 exit(0);
351}
352
353static char *event_name(int ctr)
354{
355 __u64 config = event_id[ctr];
356 int type = PERF_COUNTER_TYPE(config);
357 int id = PERF_COUNTER_ID(config);
358 static char buf[32];
359
360 if (PERF_COUNTER_RAW(config)) {
361 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
362 return buf;
363 }
364
365 switch (type) {
366 case PERF_TYPE_HARDWARE:
367 if (id < PERF_HW_EVENTS_MAX)
368 return hw_event_names[id];
369 return "unknown-hardware";
370
371 case PERF_TYPE_SOFTWARE:
372 if (id < PERF_SW_EVENTS_MAX)
373 return sw_event_names[id];
374 return "unknown-software";
375
376 default:
377 break;
378 }
379
380 return "unknown";
381}
382
383/*
384 * Each event can have multiple symbolic names.
385 * Symbolic names are (almost) exactly matched.
386 */
387static __u64 match_event_symbols(char *str)
388{
389 __u64 config, id;
390 int type;
391 unsigned int i;
392
393 if (sscanf(str, "r%llx", &config) == 1)
394 return config | PERF_COUNTER_RAW_MASK;
395
396 if (sscanf(str, "%d:%llu", &type, &id) == 2)
397 return EID(type, id);
398
399 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
400 if (!strncmp(str, event_symbols[i].symbol,
401 strlen(event_symbols[i].symbol)))
402 return event_symbols[i].event;
403 }
404
405 return ~0ULL;
406}
407
408static int parse_events(char *str)
409{
410 __u64 config;
411
412again:
413 if (nr_counters == MAX_COUNTERS)
414 return -1;
415
416 config = match_event_symbols(str);
417 if (config == ~0ULL)
418 return -1;
419
420 event_id[nr_counters] = config;
421 nr_counters++;
422
423 str = strstr(str, ",");
424 if (str) {
425 str++;
426 goto again;
427 }
428
429 return 0;
430}
431
432
433/*
434 * perfstat
435 */
436
437char fault_here[1000000];
438
439static void create_perfstat_counter(int counter)
440{
441 struct perf_counter_hw_event hw_event;
442
443 memset(&hw_event, 0, sizeof(hw_event));
444 hw_event.config = event_id[counter];
445 hw_event.record_type = 0;
446 hw_event.nmi = 0;
447 if (scale)
448 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
449 PERF_FORMAT_TOTAL_TIME_RUNNING;
450
451 if (system_wide) {
452 int cpu;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
455 if (fd[cpu][counter] < 0) {
456 printf("perfstat error: syscall returned with %d (%s)\n",
457 fd[cpu][counter], strerror(errno));
458 exit(-1);
459 }
460 }
461 } else {
462 hw_event.inherit = 1;
463 hw_event.disabled = 1;
464
465 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
466 if (fd[0][counter] < 0) {
467 printf("perfstat error: syscall returned with %d (%s)\n",
468 fd[0][counter], strerror(errno));
469 exit(-1);
470 }
471 }
472}
473
474int do_perfstat(int argc, char *argv[])
475{
476 unsigned long long t0, t1;
477 int counter;
478 ssize_t res;
479 int status;
480 int pid;
481
482 if (!system_wide)
483 nr_cpus = 1;
484
485 for (counter = 0; counter < nr_counters; counter++)
486 create_perfstat_counter(counter);
487
488 argc -= optind;
489 argv += optind;
490
491 if (!argc)
492 display_help();
493
494 /*
495 * Enable counters and exec the command:
496 */
497 t0 = rdclock();
498 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
499
500 if ((pid = fork()) < 0)
501 perror("failed to fork");
502 if (!pid) {
503 if (execvp(argv[0], argv)) {
504 perror(argv[0]);
505 exit(-1);
506 }
507 }
508 while (wait(&status) >= 0)
509 ;
510 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
511 t1 = rdclock();
512
513 fflush(stdout);
514
515 fprintf(stderr, "\n");
516 fprintf(stderr, " Performance counter stats for \'%s\':\n",
517 argv[0]);
518 fprintf(stderr, "\n");
519
520 for (counter = 0; counter < nr_counters; counter++) {
521 int cpu, nv;
522 __u64 count[3], single_count[3];
523 int scaled;
524
525 count[0] = count[1] = count[2] = 0;
526 nv = scale ? 3 : 1;
527 for (cpu = 0; cpu < nr_cpus; cpu ++) {
528 res = read(fd[cpu][counter],
529 single_count, nv * sizeof(__u64));
530 assert(res == nv * sizeof(__u64));
531
532 count[0] += single_count[0];
533 if (scale) {
534 count[1] += single_count[1];
535 count[2] += single_count[2];
536 }
537 }
538
539 scaled = 0;
540 if (scale) {
541 if (count[2] == 0) {
542 fprintf(stderr, " %14s %-20s\n",
543 "<not counted>", event_name(counter));
544 continue;
545 }
546 if (count[2] < count[1]) {
547 scaled = 1;
548 count[0] = (unsigned long long)
549 ((double)count[0] * count[1] / count[2] + 0.5);
550 }
551 }
552
553 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
554 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
555
556 double msecs = (double)count[0] / 1000000;
557
558 fprintf(stderr, " %14.6f %-20s (msecs)",
559 msecs, event_name(counter));
560 } else {
561 fprintf(stderr, " %14Ld %-20s (events)",
562 count[0], event_name(counter));
563 }
564 if (scaled)
565 fprintf(stderr, " (scaled from %.2f%%)",
566 (double) count[2] / count[1] * 100);
567 fprintf(stderr, "\n");
568 }
569 fprintf(stderr, "\n");
570 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
571 (double)(t1-t0)/1e6);
572 fprintf(stderr, "\n");
573
574 return 0;
575}
576
577/*
578 * Symbols
579 */
580
581static uint64_t min_ip;
582static uint64_t max_ip = -1ll;
583
584struct sym_entry {
585 unsigned long long addr;
586 char *sym;
587 unsigned long count[MAX_COUNTERS];
588 int skip;
589 struct source_line *source;
590};
591
592#define MAX_SYMS 100000
593
594static int sym_table_count;
595
596struct sym_entry *sym_filter_entry;
597
598static struct sym_entry sym_table[MAX_SYMS];
599
600static void show_details(struct sym_entry *sym);
601
602/*
603 * Ordering weight: count-1 * count-2 * ... / count-n
604 */
605static double sym_weight(const struct sym_entry *sym)
606{
607 double weight;
608 int counter;
609
610 weight = sym->count[0];
611
612 for (counter = 1; counter < nr_counters-1; counter++)
613 weight *= sym->count[counter];
614
615 weight /= (sym->count[counter] + 1);
616
617 return weight;
618}
619
/*
 * qsort() comparator ordering sym_entry records by descending
 * sym_weight().
 *
 * A comparator has to return a negative, zero or positive value; only
 * ever returning 0 or 1 tells qsort() that "a sorts before b" can never
 * happen and leaves the resulting order unspecified.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double weight1 = sym_weight(sym1);
	double weight2 = sym_weight(sym2);

	if (weight1 > weight2)
		return -1;
	if (weight1 < weight2)
		return 1;
	return 0;
}
626
627static long events;
628static long userspace_events;
629static const char CONSOLE_CLEAR[] = "";
630
631static struct sym_entry tmp[MAX_SYMS];
632
633static void print_sym_table(void)
634{
635 int i, printed;
636 int counter;
637 float events_per_sec = events/delay_secs;
638 float kevents_per_sec = (events-userspace_events)/delay_secs;
639 float sum_kevents = 0.0;
640
641 events = userspace_events = 0;
642 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
643 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
644
645 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
646 sum_kevents += tmp[i].count[0];
647
648 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
649
650 printf(
651"------------------------------------------------------------------------------\n");
652 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
653 events_per_sec,
654 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
655 nmi ? "NMI" : "IRQ");
656
657 if (nr_counters == 1)
658 printf("%d ", event_count[0]);
659
660 for (counter = 0; counter < nr_counters; counter++) {
661 if (counter)
662 printf("/");
663
664 printf("%s", event_name(counter));
665 }
666
667 printf( "], ");
668
669 if (tid != -1)
670 printf(" (tid: %d", tid);
671 else
672 printf(" (all");
673
674 if (profile_cpu != -1)
675 printf(", cpu: %d)\n", profile_cpu);
676 else {
677 if (tid != -1)
678 printf(")\n");
679 else
680 printf(", %d CPUs)\n", nr_cpus);
681 }
682
683 printf("------------------------------------------------------------------------------\n\n");
684
685 if (nr_counters == 1)
686 printf(" events pcnt");
687 else
688 printf(" weight events pcnt");
689
690 printf(" RIP kernel function\n"
691 " ______ ______ _____ ________________ _______________\n\n"
692 );
693
694 for (i = 0, printed = 0; i < sym_table_count; i++) {
695 float pcnt;
696 int count;
697
698 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
699 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
700
701 if (nr_counters == 1)
702 printf("%19.2f - %4.1f%% - %016llx : %s\n",
703 sym_weight(tmp + i),
704 pcnt, tmp[i].addr, tmp[i].sym);
705 else
706 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
707 sym_weight(tmp + i),
708 tmp[i].count[0],
709 pcnt, tmp[i].addr, tmp[i].sym);
710 printed++;
711 }
712 /*
713 * Add decay to the counts:
714 */
715 for (count = 0; count < nr_counters; count++)
716 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
717 }
718
719 if (sym_filter_entry)
720 show_details(sym_filter_entry);
721
722 {
723 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
724
725 if (poll(&stdin_poll, 1, 0) == 1) {
726 printf("key pressed - exiting.\n");
727 exit(0);
728 }
729 }
730}
731
732static void *display_thread(void *arg)
733{
734 printf("KernelTop refresh period: %d seconds\n", delay_secs);
735
736 while (!sleep(delay_secs))
737 print_sym_table();
738
739 return NULL;
740}
741
742static int read_symbol(FILE *in, struct sym_entry *s)
743{
744 static int filter_match = 0;
745 char *sym, stype;
746 char str[500];
747 int rc, pos;
748
749 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
750 if (rc == EOF)
751 return -1;
752
753 assert(rc == 3);
754
755 /* skip until end of line: */
756 pos = strlen(str);
757 do {
758 rc = fgetc(in);
759 if (rc == '\n' || rc == EOF || pos >= 499)
760 break;
761 str[pos] = rc;
762 pos++;
763 } while (1);
764 str[pos] = 0;
765
766 sym = str;
767
768 /* Filter out known duplicates and non-text symbols. */
769 if (!strcmp(sym, "_text"))
770 return 1;
771 if (!min_ip && !strcmp(sym, "_stext"))
772 return 1;
773 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
774 return 1;
775 if (stype != 'T' && stype != 't')
776 return 1;
777 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
778 return 1;
779 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
780 return 1;
781
782 s->sym = malloc(strlen(str));
783 assert(s->sym);
784
785 strcpy((char *)s->sym, str);
786 s->skip = 0;
787
788 /* Tag events to be skipped. */
789 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
790 s->skip = 1;
791 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
792 s->skip = 1;
793 else if (!strcmp("mwait_idle", s->sym))
794 s->skip = 1;
795
796 if (filter_match == 1) {
797 filter_end = s->addr;
798 filter_match = -1;
799 if (filter_end - filter_start > 10000) {
800 printf("hm, too large filter symbol <%s> - skipping.\n",
801 sym_filter);
802 printf("symbol filter start: %016lx\n", filter_start);
803 printf(" end: %016lx\n", filter_end);
804 filter_end = filter_start = 0;
805 sym_filter = NULL;
806 sleep(1);
807 }
808 }
809 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
810 filter_match = 1;
811 filter_start = s->addr;
812 }
813
814 return 0;
815}
816
817int compare_addr(const void *__sym1, const void *__sym2)
818{
819 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
820
821 return sym1->addr > sym2->addr;
822}
823
824static void sort_symbol_table(void)
825{
826 int i, dups;
827
828 do {
829 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
830 for (i = 0, dups = 0; i < sym_table_count; i++) {
831 if (sym_table[i].addr == sym_table[i+1].addr) {
832 sym_table[i+1].addr = -1ll;
833 dups++;
834 }
835 }
836 sym_table_count -= dups;
837 } while(dups);
838}
839
840static void parse_symbols(void)
841{
842 struct sym_entry *last;
843
844 FILE *kallsyms = fopen("/proc/kallsyms", "r");
845
846 if (!kallsyms) {
847 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
848 exit(-1);
849 }
850
851 while (!feof(kallsyms)) {
852 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
853 sym_table_count++;
854 assert(sym_table_count <= MAX_SYMS);
855 }
856 }
857
858 sort_symbol_table();
859 min_ip = sym_table[0].addr;
860 max_ip = sym_table[sym_table_count-1].addr;
861 last = sym_table + sym_table_count++;
862
863 last->addr = -1ll;
864 last->sym = "<end>";
865
866 if (filter_end) {
867 int count;
868 for (count=0; count < sym_table_count; count ++) {
869 if (!strcmp(sym_table[count].sym, sym_filter)) {
870 sym_filter_entry = &sym_table[count];
871 break;
872 }
873 }
874 }
875 if (dump_symtab) {
876 int i;
877
878 for (i = 0; i < sym_table_count; i++)
879 fprintf(stderr, "%llx %s\n",
880 sym_table[i].addr, sym_table[i].sym);
881 }
882}
883
884/*
885 * Source lines
886 */
887
888static void parse_vmlinux(char *filename)
889{
890 FILE *file;
891 char command[PATH_MAX*2];
892 if (!filename)
893 return;
894
895 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
896
897 file = popen(command, "r");
898 if (!file)
899 return;
900
901 lines_tail = &lines;
902 while (!feof(file)) {
903 struct source_line *src;
904 size_t dummy = 0;
905 char *c;
906
907 src = malloc(sizeof(struct source_line));
908 assert(src != NULL);
909 memset(src, 0, sizeof(struct source_line));
910
911 if (getline(&src->line, &dummy, file) < 0)
912 break;
913 if (!src->line)
914 break;
915
916 c = strchr(src->line, '\n');
917 if (c)
918 *c = 0;
919
920 src->next = NULL;
921 *lines_tail = src;
922 lines_tail = &src->next;
923
924 if (strlen(src->line)>8 && src->line[8] == ':')
925 src->EIP = strtoull(src->line, NULL, 16);
926 if (strlen(src->line)>8 && src->line[16] == ':')
927 src->EIP = strtoull(src->line, NULL, 16);
928 }
929 pclose(file);
930}
931
932static void record_precise_ip(uint64_t ip)
933{
934 struct source_line *line;
935
936 for (line = lines; line; line = line->next) {
937 if (line->EIP == ip)
938 line->count++;
939 if (line->EIP > ip)
940 break;
941 }
942}
943
944static void lookup_sym_in_vmlinux(struct sym_entry *sym)
945{
946 struct source_line *line;
947 char pattern[PATH_MAX];
948 sprintf(pattern, "<%s>:", sym->sym);
949
950 for (line = lines; line; line = line->next) {
951 if (strstr(line->line, pattern)) {
952 sym->source = line;
953 break;
954 }
955 }
956}
957
958static void show_lines(struct source_line *line_queue, int line_queue_count)
959{
960 int i;
961 struct source_line *line;
962
963 line = line_queue;
964 for (i = 0; i < line_queue_count; i++) {
965 printf("%8li\t%s\n", line->count, line->line);
966 line = line->next;
967 }
968}
969
970#define TRACE_COUNT 3
971
972static void show_details(struct sym_entry *sym)
973{
974 struct source_line *line;
975 struct source_line *line_queue = NULL;
976 int displayed = 0;
977 int line_queue_count = 0;
978
979 if (!sym->source)
980 lookup_sym_in_vmlinux(sym);
981 if (!sym->source)
982 return;
983
984 printf("Showing details for %s\n", sym->sym);
985
986 line = sym->source;
987 while (line) {
988 if (displayed && strstr(line->line, ">:"))
989 break;
990
991 if (!line_queue_count)
992 line_queue = line;
993 line_queue_count ++;
994
995 if (line->count >= count_filter) {
996 show_lines(line_queue, line_queue_count);
997 line_queue_count = 0;
998 line_queue = NULL;
999 } else if (line_queue_count > TRACE_COUNT) {
1000 line_queue = line_queue->next;
1001 line_queue_count --;
1002 }
1003
1004 line->count = 0;
1005 displayed++;
1006 if (displayed > 300)
1007 break;
1008 line = line->next;
1009 }
1010}
1011
1012/*
1013 * Binary search in the histogram table and record the hit:
1014 */
1015static void record_ip(uint64_t ip, int counter)
1016{
1017 int left_idx, middle_idx, right_idx, idx;
1018 unsigned long left, middle, right;
1019
1020 record_precise_ip(ip);
1021
1022 left_idx = 0;
1023 right_idx = sym_table_count-1;
1024 assert(ip <= max_ip && ip >= min_ip);
1025
1026 while (left_idx + 1 < right_idx) {
1027 middle_idx = (left_idx + right_idx) / 2;
1028
1029 left = sym_table[ left_idx].addr;
1030 middle = sym_table[middle_idx].addr;
1031 right = sym_table[ right_idx].addr;
1032
1033 if (!(left <= middle && middle <= right)) {
1034 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1035 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1036 }
1037 assert(left <= middle && middle <= right);
1038 if (!(left <= ip && ip <= right)) {
1039 printf(" left: %016lx\n", left);
1040 printf(" ip: %016lx\n", (unsigned long)ip);
1041 printf("right: %016lx\n", right);
1042 }
1043 assert(left <= ip && ip <= right);
1044 /*
1045 * [ left .... target .... middle .... right ]
1046 * => right := middle
1047 */
1048 if (ip < middle) {
1049 right_idx = middle_idx;
1050 continue;
1051 }
1052 /*
1053 * [ left .... middle ... target ... right ]
1054 * => left := middle
1055 */
1056 left_idx = middle_idx;
1057 }
1058
1059 idx = left_idx;
1060
1061 if (!sym_table[idx].skip)
1062 sym_table[idx].count[counter]++;
1063 else events--;
1064}
1065
1066static void process_event(uint64_t ip, int counter)
1067{
1068 events++;
1069
1070 if (ip < min_ip || ip > max_ip) {
1071 userspace_events++;
1072 return;
1073 }
1074
1075 record_ip(ip, counter);
1076}
1077
1078static void process_options(int argc, char *argv[])
1079{
1080 int error = 0, counter;
1081
1082 if (strstr(argv[0], "perfstat"))
1083 run_perfstat = 1;
1084
1085 for (;;) {
1086 int option_index = 0;
1087 /** Options for getopt */
1088 static struct option long_options[] = {
1089 {"count", required_argument, NULL, 'c'},
1090 {"cpu", required_argument, NULL, 'C'},
1091 {"delay", required_argument, NULL, 'd'},
1092 {"dump_symtab", no_argument, NULL, 'D'},
1093 {"event", required_argument, NULL, 'e'},
1094 {"filter", required_argument, NULL, 'f'},
1095 {"group", required_argument, NULL, 'g'},
1096 {"help", no_argument, NULL, 'h'},
1097 {"nmi", required_argument, NULL, 'n'},
1098 {"mmap_info", no_argument, NULL, 'M'},
1099 {"mmap_pages", required_argument, NULL, 'm'},
1100 {"munmap_info", no_argument, NULL, 'U'},
1101 {"pid", required_argument, NULL, 'p'},
1102 {"realtime", required_argument, NULL, 'r'},
1103 {"scale", no_argument, NULL, 'l'},
1104 {"symbol", required_argument, NULL, 's'},
1105 {"stat", no_argument, NULL, 'S'},
1106 {"vmlinux", required_argument, NULL, 'x'},
1107 {"zero", no_argument, NULL, 'z'},
1108 {NULL, 0, NULL, 0 }
1109 };
1110 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
1111 long_options, &option_index);
1112 if (c == -1)
1113 break;
1114
1115 switch (c) {
1116 case 'a': system_wide = 1; break;
1117 case 'c': default_interval = atoi(optarg); break;
1118 case 'C':
1119 /* CPU and PID are mutually exclusive */
1120 if (tid != -1) {
1121 printf("WARNING: CPU switch overriding PID\n");
1122 sleep(1);
1123 tid = -1;
1124 }
1125 profile_cpu = atoi(optarg); break;
1126 case 'd': delay_secs = atoi(optarg); break;
1127 case 'D': dump_symtab = 1; break;
1128
1129 case 'e': error = parse_events(optarg); break;
1130
1131 case 'f': count_filter = atoi(optarg); break;
1132 case 'g': group = atoi(optarg); break;
1133 case 'h': display_help(); break;
1134 case 'l': scale = 1; break;
1135 case 'n': nmi = atoi(optarg); break;
1136 case 'p':
1137 /* CPU and PID are mutually exclusive */
1138 if (profile_cpu != -1) {
1139 printf("WARNING: PID switch overriding CPU\n");
1140 sleep(1);
1141 profile_cpu = -1;
1142 }
1143 tid = atoi(optarg); break;
1144 case 'r': realtime_prio = atoi(optarg); break;
1145 case 's': sym_filter = strdup(optarg); break;
1146 case 'S': run_perfstat = 1; break;
1147 case 'x': vmlinux = strdup(optarg); break;
1148 case 'z': zero = 1; break;
1149 case 'm': mmap_pages = atoi(optarg); break;
1150 case 'M': use_mmap = 1; break;
1151 case 'U': use_munmap = 1; break;
1152 default: error = 1; break;
1153 }
1154 }
1155 if (error)
1156 display_help();
1157
1158 if (!nr_counters) {
1159 if (run_perfstat)
1160 nr_counters = 8;
1161 else {
1162 nr_counters = 1;
1163 event_id[0] = 0;
1164 }
1165 }
1166
1167 for (counter = 0; counter < nr_counters; counter++) {
1168 if (event_count[counter])
1169 continue;
1170
1171 event_count[counter] = default_interval;
1172 }
1173}
1174
/* Reader-side state for one mmap'ed counter buffer. */
struct mmap_data {
	int			counter;	/* counter index passed on to process_event() */
	void			*base;		/* start of the mapping (control page first) */
	unsigned int		mask;		/* AND-ed with an offset to wrap it into the data area */
	unsigned int		prev;		/* offset from which mmap_read() resumes */
};
1181
1182static unsigned int mmap_read_head(struct mmap_data *md)
1183{
1184 struct perf_counter_mmap_page *pc = md->base;
1185 int head;
1186
1187 head = pc->data_head;
1188 rmb();
1189
1190 return head;
1191}
1192
1193struct timeval last_read, this_read;
1194
1195static void mmap_read(struct mmap_data *md)
1196{
1197 unsigned int head = mmap_read_head(md);
1198 unsigned int old = md->prev;
1199 unsigned char *data = md->base + page_size;
1200 int diff;
1201
1202 gettimeofday(&this_read, NULL);
1203
1204 /*
1205 * If we're further behind than half the buffer, there's a chance
1206 * the writer will bite our tail and screw up the events under us.
1207 *
1208 * If we somehow ended up ahead of the head, we got messed up.
1209 *
1210 * In either case, truncate and restart at head.
1211 */
1212 diff = head - old;
1213 if (diff > md->mask / 2 || diff < 0) {
1214 struct timeval iv;
1215 unsigned long msecs;
1216
1217 timersub(&this_read, &last_read, &iv);
1218 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1219
1220 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1221 " Last read %lu msecs ago.\n", msecs);
1222
1223 /*
1224 * head points to a known good entry, start there.
1225 */
1226 old = head;
1227 }
1228
1229 last_read = this_read;
1230
1231 for (; old != head;) {
1232 struct ip_event {
1233 struct perf_event_header header;
1234 __u64 ip;
1235 __u32 pid, tid;
1236 };
1237 struct mmap_event {
1238 struct perf_event_header header;
1239 __u32 pid, tid;
1240 __u64 start;
1241 __u64 len;
1242 __u64 pgoff;
1243 char filename[PATH_MAX];
1244 };
1245
1246 typedef union event_union {
1247 struct perf_event_header header;
1248 struct ip_event ip;
1249 struct mmap_event mmap;
1250 } event_t;
1251
1252 event_t *event = (event_t *)&data[old & md->mask];
1253
1254 event_t event_copy;
1255
1256 unsigned int size = event->header.size;
1257
1258 /*
1259 * Event straddles the mmap boundary -- header should always
1260 * be inside due to u64 alignment of output.
1261 */
1262 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1263 unsigned int offset = old;
1264 unsigned int len = min(sizeof(*event), size), cpy;
1265 void *dst = &event_copy;
1266
1267 do {
1268 cpy = min(md->mask + 1 - (offset & md->mask), len);
1269 memcpy(dst, &data[offset & md->mask], cpy);
1270 offset += cpy;
1271 dst += cpy;
1272 len -= cpy;
1273 } while (len);
1274
1275 event = &event_copy;
1276 }
1277
1278 old += size;
1279
1280 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
1281 if (event->header.type & PERF_RECORD_IP)
1282 process_event(event->ip.ip, md->counter);
1283 } else {
1284 switch (event->header.type) {
1285 case PERF_EVENT_MMAP:
1286 case PERF_EVENT_MUNMAP:
1287 printf("%s: %Lu %Lu %Lu %s\n",
1288 event->header.type == PERF_EVENT_MMAP
1289 ? "mmap" : "munmap",
1290 event->mmap.start,
1291 event->mmap.len,
1292 event->mmap.pgoff,
1293 event->mmap.filename);
1294 break;
1295 }
1296 }
1297 }
1298
1299 md->prev = old;
1300}
1301
1302int main(int argc, char *argv[])
1303{
1304 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1305 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1306 struct perf_counter_hw_event hw_event;
1307 pthread_t thread;
1308 int i, counter, group_fd, nr_poll = 0;
1309 unsigned int cpu;
1310 int ret;
1311
1312 page_size = sysconf(_SC_PAGE_SIZE);
1313
1314 process_options(argc, argv);
1315
1316 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1317 assert(nr_cpus <= MAX_NR_CPUS);
1318 assert(nr_cpus >= 0);
1319
1320 if (run_perfstat)
1321 return do_perfstat(argc, argv);
1322
1323 if (tid != -1 || profile_cpu != -1)
1324 nr_cpus = 1;
1325
1326 parse_symbols();
1327 if (vmlinux && sym_filter_entry)
1328 parse_vmlinux(vmlinux);
1329
1330 for (i = 0; i < nr_cpus; i++) {
1331 group_fd = -1;
1332 for (counter = 0; counter < nr_counters; counter++) {
1333
1334 cpu = profile_cpu;
1335 if (tid == -1 && profile_cpu == -1)
1336 cpu = i;
1337
1338 memset(&hw_event, 0, sizeof(hw_event));
1339 hw_event.config = event_id[counter];
1340 hw_event.irq_period = event_count[counter];
1341 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1342 hw_event.nmi = nmi;
1343 hw_event.mmap = use_mmap;
1344 hw_event.munmap = use_munmap;
1345
1346 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1347 if (fd[i][counter] < 0) {
1348 int err = errno;
1349 printf("kerneltop error: syscall returned with %d (%s)\n",
1350 fd[i][counter], strerror(err));
1351 if (err == EPERM)
1352 printf("Are you root?\n");
1353 exit(-1);
1354 }
1355 assert(fd[i][counter] >= 0);
1356 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1357
1358 /*
1359 * First counter acts as the group leader:
1360 */
1361 if (group && group_fd == -1)
1362 group_fd = fd[i][counter];
1363
1364 event_array[nr_poll].fd = fd[i][counter];
1365 event_array[nr_poll].events = POLLIN;
1366 nr_poll++;
1367
1368 mmap_array[i][counter].counter = counter;
1369 mmap_array[i][counter].prev = 0;
1370 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1371 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1372 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1373 if (mmap_array[i][counter].base == MAP_FAILED) {
1374 printf("kerneltop error: failed to mmap with %d (%s)\n",
1375 errno, strerror(errno));
1376 exit(-1);
1377 }
1378 }
1379 }
1380
1381 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1382 printf("Could not create display thread.\n");
1383 exit(-1);
1384 }
1385
1386 if (realtime_prio) {
1387 struct sched_param param;
1388
1389 param.sched_priority = realtime_prio;
1390 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1391 printf("Could not set realtime priority.\n");
1392 exit(-1);
1393 }
1394 }
1395
1396 while (1) {
1397 int hits = events;
1398
1399 for (i = 0; i < nr_cpus; i++) {
1400 for (counter = 0; counter < nr_counters; counter++)
1401 mmap_read(&mmap_array[i][counter]);
1402 }
1403
1404 if (hits == events)
1405 ret = poll(event_array, nr_poll, 100);
1406 }
1407
1408 return 0;
1409}