author    Ingo Molnar <mingo@elte.hu>    2009-04-20 09:37:32 -0400
committer Ingo Molnar <mingo@elte.hu>    2009-04-20 11:36:48 -0400
commit    ddcacfa0febff6454dba6cea1931f3020a9f6c24 (patch)
tree      0921fe8644ef904ad154640a584830d552a02bee
parent    6f06ccbc86f8a02aa32271263249657ce484eb25 (diff)
perf_counter tools: separate kerneltop into 'perf top' and 'perf stat'
Let's use the Git framework of built-in commands.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
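A usage sketch (not part of the patch itself): after this split, the counting mode that kerneltop reached through the '-S'/'--stat' switch or the 'perfstat' program name becomes the 'perf stat' built-in, and the sampling mode becomes 'perf top'. The commands below are illustrative only; the event names come from the event_symbols[] table carried into builtin-stat.c:

    # counting mode ("Usage: perfstat [<events...>] <cmd...>" in builtin-stat.c)
    perf stat -e instructions -e cache-misses -e branch-misses ls -lR /usr/include/ >/dev/null

    # sampling mode (formerly kerneltop): cycles, 100000-event period, CPU 2 only
    perf top -e cycles -c 100000 -C 2

Repeating -e appends events, up to MAX_COUNTERS (64); with no -e at all, 'perf stat' falls back to its default set of eight software and hardware counters.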
-rw-r--r--  Documentation/perf_counter/Makefile            1
-rw-r--r--  Documentation/perf_counter/builtin-stat.c    592
-rw-r--r--  Documentation/perf_counter/builtin-top.c     204
-rw-r--r--  Documentation/perf_counter/builtin.h           3
-rw-r--r--  Documentation/perf_counter/command-list.txt    1
-rw-r--r--  Documentation/perf_counter/kerneltop.c      1409
-rw-r--r--  Documentation/perf_counter/perf.c              1
7 files changed, 601 insertions(+), 1610 deletions(-)
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 1b6026555547..fb8b71744e59 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -309,6 +309,7 @@ LIB_OBJS += usage.o
 LIB_OBJS += wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
 PERFLIBS = $(LIB_FILE)
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
new file mode 100644
index 000000000000..169a2d1783fc
--- /dev/null
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -0,0 +1,592 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#include "util.h"
65
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
71#include <ctype.h>
72#include <time.h>
73#include <sched.h>
74#include <pthread.h>
75
76#include <sys/syscall.h>
77#include <sys/ioctl.h>
78#include <sys/poll.h>
79#include <sys/prctl.h>
80#include <sys/wait.h>
81#include <sys/uio.h>
82#include <sys/mman.h>
83
84#include <linux/unistd.h>
85#include <linux/types.h>
86
87#include "../../include/linux/perf_counter.h"
88
89
90/*
91 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
92 * counters in the current task.
93 */
94#define PR_TASK_PERF_COUNTERS_DISABLE 31
95#define PR_TASK_PERF_COUNTERS_ENABLE 32
96
97#define rdclock() \
98({ \
99 struct timespec ts; \
100 \
101 clock_gettime(CLOCK_MONOTONIC, &ts); \
102 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
103})
104
105/*
106 * Pick up some kernel type conventions:
107 */
108#define __user
109#define asmlinkage
110
111#ifdef __x86_64__
112#define __NR_perf_counter_open 295
113#define rmb() asm volatile("lfence" ::: "memory")
114#define cpu_relax() asm volatile("rep; nop" ::: "memory");
115#endif
116
117#ifdef __i386__
118#define __NR_perf_counter_open 333
119#define rmb() asm volatile("lfence" ::: "memory")
120#define cpu_relax() asm volatile("rep; nop" ::: "memory");
121#endif
122
123#ifdef __powerpc__
124#define __NR_perf_counter_open 319
125#define rmb() asm volatile ("sync" ::: "memory")
126#define cpu_relax() asm volatile ("" ::: "memory");
127#endif
128
129#define unlikely(x) __builtin_expect(!!(x), 0)
130#define min(x, y) ({ \
131 typeof(x) _min1 = (x); \
132 typeof(y) _min2 = (y); \
133 (void) (&_min1 == &_min2); \
134 _min1 < _min2 ? _min1 : _min2; })
135
136extern asmlinkage int sys_perf_counter_open(
137 struct perf_counter_hw_event *hw_event_uptr __user,
138 pid_t pid,
139 int cpu,
140 int group_fd,
141 unsigned long flags);
142
143#define MAX_COUNTERS 64
144#define MAX_NR_CPUS 256
145
146#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
147
148static int system_wide = 0;
149
150static int nr_counters = 0;
151static __u64 event_id[MAX_COUNTERS] = {
152 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
153 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
154 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
155 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
156
157 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
158 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
159 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
160 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
161};
162static int default_interval = 100000;
163static int event_count[MAX_COUNTERS];
164static int fd[MAX_NR_CPUS][MAX_COUNTERS];
165
166static int tid = -1;
167static int profile_cpu = -1;
168static int nr_cpus = 0;
169static int nmi = 1;
170static int group = 0;
171static unsigned int page_size;
172
173static int zero;
174
175static int scale;
176
177static const unsigned int default_count[] = {
178 1000000,
179 1000000,
180 10000,
181 10000,
182 1000000,
183 10000,
184};
185
186static char *hw_event_names[] = {
187 "CPU cycles",
188 "instructions",
189 "cache references",
190 "cache misses",
191 "branches",
192 "branch misses",
193 "bus cycles",
194};
195
196static char *sw_event_names[] = {
197 "cpu clock ticks",
198 "task clock ticks",
199 "pagefaults",
200 "context switches",
201 "CPU migrations",
202 "minor faults",
203 "major faults",
204};
205
206struct event_symbol {
207 __u64 event;
208 char *symbol;
209};
210
211static struct event_symbol event_symbols[] = {
212 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
213 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
214 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
215 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
216 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
217 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
218 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
219 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
220 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
221
222 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
223 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
224 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
225 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
226 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
227 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
228 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
229 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
230 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
231 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
232};
233
234#define __PERF_COUNTER_FIELD(config, name) \
235 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
236
237#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
238#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
239#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
240#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
241
242static void display_events_help(void)
243{
244 unsigned int i;
245 __u64 e;
246
247 printf(
248 " -e EVENT --event=EVENT # symbolic-name abbreviations");
249
250 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
251 int type, id;
252
253 e = event_symbols[i].event;
254 type = PERF_COUNTER_TYPE(e);
255 id = PERF_COUNTER_ID(e);
256
257 printf("\n %d:%d: %-20s",
258 type, id, event_symbols[i].symbol);
259 }
260
261 printf("\n"
262 " rNNN: raw PMU events (eventsel+umask)\n\n");
263}
264
265static void display_help(void)
266{
267 printf(
268 "Usage: perfstat [<events...>] <cmd...>\n\n"
269 "PerfStat Options (up to %d event types can be specified):\n\n",
270 MAX_COUNTERS);
271
272 display_events_help();
273
274 printf(
275 " -l # scale counter values\n"
276 " -a # system-wide collection\n");
277 exit(0);
278}
279
280static char *event_name(int ctr)
281{
282 __u64 config = event_id[ctr];
283 int type = PERF_COUNTER_TYPE(config);
284 int id = PERF_COUNTER_ID(config);
285 static char buf[32];
286
287 if (PERF_COUNTER_RAW(config)) {
288 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
289 return buf;
290 }
291
292 switch (type) {
293 case PERF_TYPE_HARDWARE:
294 if (id < PERF_HW_EVENTS_MAX)
295 return hw_event_names[id];
296 return "unknown-hardware";
297
298 case PERF_TYPE_SOFTWARE:
299 if (id < PERF_SW_EVENTS_MAX)
300 return sw_event_names[id];
301 return "unknown-software";
302
303 default:
304 break;
305 }
306
307 return "unknown";
308}
309
310/*
311 * Each event can have multiple symbolic names.
312 * Symbolic names are (almost) exactly matched.
313 */
314static __u64 match_event_symbols(char *str)
315{
316 __u64 config, id;
317 int type;
318 unsigned int i;
319
320 if (sscanf(str, "r%llx", &config) == 1)
321 return config | PERF_COUNTER_RAW_MASK;
322
323 if (sscanf(str, "%d:%llu", &type, &id) == 2)
324 return EID(type, id);
325
326 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
327 if (!strncmp(str, event_symbols[i].symbol,
328 strlen(event_symbols[i].symbol)))
329 return event_symbols[i].event;
330 }
331
332 return ~0ULL;
333}
334
335static int parse_events(char *str)
336{
337 __u64 config;
338
339again:
340 if (nr_counters == MAX_COUNTERS)
341 return -1;
342
343 config = match_event_symbols(str);
344 if (config == ~0ULL)
345 return -1;
346
347 event_id[nr_counters] = config;
348 nr_counters++;
349
350 str = strstr(str, ",");
351 if (str) {
352 str++;
353 goto again;
354 }
355
356 return 0;
357}
358
359
360/*
361 * perfstat
362 */
363
364char fault_here[1000000];
365
366static void create_perfstat_counter(int counter)
367{
368 struct perf_counter_hw_event hw_event;
369
370 memset(&hw_event, 0, sizeof(hw_event));
371 hw_event.config = event_id[counter];
372 hw_event.record_type = 0;
373 hw_event.nmi = 0;
374 if (scale)
375 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
376 PERF_FORMAT_TOTAL_TIME_RUNNING;
377
378 if (system_wide) {
379 int cpu;
380 for (cpu = 0; cpu < nr_cpus; cpu ++) {
381 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
382 if (fd[cpu][counter] < 0) {
383 printf("perfstat error: syscall returned with %d (%s)\n",
384 fd[cpu][counter], strerror(errno));
385 exit(-1);
386 }
387 }
388 } else {
389 hw_event.inherit = 1;
390 hw_event.disabled = 1;
391
392 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
393 if (fd[0][counter] < 0) {
394 printf("perfstat error: syscall returned with %d (%s)\n",
395 fd[0][counter], strerror(errno));
396 exit(-1);
397 }
398 }
399}
400
401int do_perfstat(int argc, char *argv[])
402{
403 unsigned long long t0, t1;
404 int counter;
405 ssize_t res;
406 int status;
407 int pid;
408
409 if (!system_wide)
410 nr_cpus = 1;
411
412 for (counter = 0; counter < nr_counters; counter++)
413 create_perfstat_counter(counter);
414
415 argc -= optind;
416 argv += optind;
417
418 if (!argc)
419 display_help();
420
421 /*
422 * Enable counters and exec the command:
423 */
424 t0 = rdclock();
425 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
426
427 if ((pid = fork()) < 0)
428 perror("failed to fork");
429 if (!pid) {
430 if (execvp(argv[0], argv)) {
431 perror(argv[0]);
432 exit(-1);
433 }
434 }
435 while (wait(&status) >= 0)
436 ;
437 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
438 t1 = rdclock();
439
440 fflush(stdout);
441
442 fprintf(stderr, "\n");
443 fprintf(stderr, " Performance counter stats for \'%s\':\n",
444 argv[0]);
445 fprintf(stderr, "\n");
446
447 for (counter = 0; counter < nr_counters; counter++) {
448 int cpu, nv;
449 __u64 count[3], single_count[3];
450 int scaled;
451
452 count[0] = count[1] = count[2] = 0;
453 nv = scale ? 3 : 1;
454 for (cpu = 0; cpu < nr_cpus; cpu ++) {
455 res = read(fd[cpu][counter],
456 single_count, nv * sizeof(__u64));
457 assert(res == nv * sizeof(__u64));
458
459 count[0] += single_count[0];
460 if (scale) {
461 count[1] += single_count[1];
462 count[2] += single_count[2];
463 }
464 }
465
466 scaled = 0;
467 if (scale) {
468 if (count[2] == 0) {
469 fprintf(stderr, " %14s %-20s\n",
470 "<not counted>", event_name(counter));
471 continue;
472 }
473 if (count[2] < count[1]) {
474 scaled = 1;
475 count[0] = (unsigned long long)
476 ((double)count[0] * count[1] / count[2] + 0.5);
477 }
478 }
479
480 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
481 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
482
483 double msecs = (double)count[0] / 1000000;
484
485 fprintf(stderr, " %14.6f %-20s (msecs)",
486 msecs, event_name(counter));
487 } else {
488 fprintf(stderr, " %14Ld %-20s (events)",
489 count[0], event_name(counter));
490 }
491 if (scaled)
492 fprintf(stderr, " (scaled from %.2f%%)",
493 (double) count[2] / count[1] * 100);
494 fprintf(stderr, "\n");
495 }
496 fprintf(stderr, "\n");
497 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
498 (double)(t1-t0)/1e6);
499 fprintf(stderr, "\n");
500
501 return 0;
502}
503
504static void process_options(int argc, char **argv)
505{
506 int error = 0, counter;
507
508 for (;;) {
509 int option_index = 0;
510 /** Options for getopt */
511 static struct option long_options[] = {
512 {"count", required_argument, NULL, 'c'},
513 {"cpu", required_argument, NULL, 'C'},
514 {"delay", required_argument, NULL, 'd'},
515 {"dump_symtab", no_argument, NULL, 'D'},
516 {"event", required_argument, NULL, 'e'},
517 {"filter", required_argument, NULL, 'f'},
518 {"group", required_argument, NULL, 'g'},
519 {"help", no_argument, NULL, 'h'},
520 {"nmi", required_argument, NULL, 'n'},
521 {"munmap_info", no_argument, NULL, 'U'},
522 {"pid", required_argument, NULL, 'p'},
523 {"realtime", required_argument, NULL, 'r'},
524 {"scale", no_argument, NULL, 'l'},
525 {"symbol", required_argument, NULL, 's'},
526 {"stat", no_argument, NULL, 'S'},
527 {"vmlinux", required_argument, NULL, 'x'},
528 {"zero", no_argument, NULL, 'z'},
529 {NULL, 0, NULL, 0 }
530 };
531 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
532 long_options, &option_index);
533 if (c == -1)
534 break;
535
536 switch (c) {
537 case 'a': system_wide = 1; break;
538 case 'c': default_interval = atoi(optarg); break;
539 case 'C':
540 /* CPU and PID are mutually exclusive */
541 if (tid != -1) {
542 printf("WARNING: CPU switch overriding PID\n");
543 sleep(1);
544 tid = -1;
545 }
546 profile_cpu = atoi(optarg); break;
547
548 case 'e': error = parse_events(optarg); break;
549
550 case 'g': group = atoi(optarg); break;
551 case 'h': display_help(); break;
552 case 'l': scale = 1; break;
553 case 'n': nmi = atoi(optarg); break;
554 case 'p':
555 /* CPU and PID are mutually exclusive */
556 if (profile_cpu != -1) {
557 printf("WARNING: PID switch overriding CPU\n");
558 sleep(1);
559 profile_cpu = -1;
560 }
561 tid = atoi(optarg); break;
562 case 'z': zero = 1; break;
563 default: error = 1; break;
564 }
565 }
566 if (error)
567 display_help();
568
569 if (!nr_counters) {
570 nr_counters = 8;
571 }
572
573 for (counter = 0; counter < nr_counters; counter++) {
574 if (event_count[counter])
575 continue;
576
577 event_count[counter] = default_interval;
578 }
579}
580
581int cmd_stat(int argc, char **argv, const char *prefix)
582{
583 page_size = sysconf(_SC_PAGE_SIZE);
584
585 process_options(argc, argv);
586
587 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
588 assert(nr_cpus <= MAX_NR_CPUS);
589 assert(nr_cpus >= 0);
590
591 return do_perfstat(argc, argv);
592}
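
A note on the scaling path above (an aside, not part of the patch): with -l, create_perfstat_counter() sets read_format to PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING, so each read() returns three __u64 values: the raw count plus the time the counter was enabled and the time it was actually running. When the counter ran for only part of the window (count[2] < count[1]), do_perfstat() extrapolates count * enabled / running and reports the running/enabled ratio as "(scaled from N%)". A quick check with made-up numbers, in shell arithmetic:

    count=120000 enabled=8000000 running=2000000           # hypothetical values
    echo $(( (count * enabled + running / 2) / running ))  # -> 480000, the extrapolated count
    echo "scaled from $(( running * 100 / enabled ))%"     # -> scaled from 25%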
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 601bddbc30d5..98e8690b6bcb 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -28,25 +28,6 @@
  8.54 - ffffffff805001a3 : ip_queue_xmit
  */
 
-/*
- * perfstat: /usr/bin/time -alike performance counter statistics utility
-
- It summarizes the counter events of all tasks (and child tasks),
- covering all CPUs that the command (or workload) executes on.
- It only counts the per-task events of the workload started,
- independent of how many other tasks run on those CPUs.
-
- Sample output:
-
- $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
- Performance counter stats for 'ls':
-
- 163516953 instructions
- 2295 cache-misses
- 2855182 branch-misses
- */
-
 /*
  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
  *
@@ -149,7 +130,6 @@ asmlinkage int sys_perf_counter_open(
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-static int run_perfstat = 0;
 static int system_wide = 0;
 
 static int nr_counters = 0;
@@ -203,7 +183,7 @@ struct source_line {
 static struct source_line *lines;
 static struct source_line **lines_tail;
 
-const unsigned int default_count[] = {
+static const unsigned int default_count[] = {
  1000000,
  1000000,
  10000,
@@ -291,26 +271,8 @@ static void display_events_help(void)
291 " rNNN: raw PMU events (eventsel+umask)\n\n"); 271 " rNNN: raw PMU events (eventsel+umask)\n\n");
292} 272}
293 273
294static void display_perfstat_help(void)
295{
296 printf(
297 "Usage: perfstat [<events...>] <cmd...>\n\n"
298 "PerfStat Options (up to %d event types can be specified):\n\n",
299 MAX_COUNTERS);
300
301 display_events_help();
302
303 printf(
304 " -l # scale counter values\n"
305 " -a # system-wide collection\n");
306 exit(0);
307}
308
309static void display_help(void) 274static void display_help(void)
310{ 275{
311 if (run_perfstat)
312 return display_perfstat_help();
313
314 printf( 276 printf(
315 "Usage: kerneltop [<options>]\n" 277 "Usage: kerneltop [<options>]\n"
316 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n" 278 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
@@ -320,8 +282,6 @@ static void display_help(void)
  display_events_help();
 
  printf(
- " -S --stat # perfstat COMMAND\n"
- " -a # system-wide collection (for perfstat)\n\n"
  " -c CNT --count=CNT # event period to sample\n\n"
  " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
  " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
@@ -420,151 +380,6 @@ again:
  return 0;
 }
 
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
-static void create_perfstat_counter(int counter)
-{
- struct perf_counter_hw_event hw_event;
-
- memset(&hw_event, 0, sizeof(hw_event));
- hw_event.config = event_id[counter];
- hw_event.record_type = 0;
- hw_event.nmi = 0;
- if (scale)
- hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
- PERF_FORMAT_TOTAL_TIME_RUNNING;
-
- if (system_wide) {
- int cpu;
- for (cpu = 0; cpu < nr_cpus; cpu ++) {
- fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
- if (fd[cpu][counter] < 0) {
- printf("perfstat error: syscall returned with %d (%s)\n",
- fd[cpu][counter], strerror(errno));
- exit(-1);
- }
- }
- } else {
- hw_event.inherit = 1;
- hw_event.disabled = 1;
-
- fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
- if (fd[0][counter] < 0) {
- printf("perfstat error: syscall returned with %d (%s)\n",
- fd[0][counter], strerror(errno));
- exit(-1);
- }
- }
-}
-
-int do_perfstat(int argc, char *argv[])
-{
- unsigned long long t0, t1;
- int counter;
- ssize_t res;
- int status;
- int pid;
-
- if (!system_wide)
- nr_cpus = 1;
-
- for (counter = 0; counter < nr_counters; counter++)
- create_perfstat_counter(counter);
-
- argc -= optind;
- argv += optind;
-
- if (!argc)
- display_help();
-
- /*
- * Enable counters and exec the command:
- */
- t0 = rdclock();
- prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
- if ((pid = fork()) < 0)
- perror("failed to fork");
- if (!pid) {
- if (execvp(argv[0], argv)) {
- perror(argv[0]);
- exit(-1);
- }
- }
- while (wait(&status) >= 0)
- ;
- prctl(PR_TASK_PERF_COUNTERS_DISABLE);
- t1 = rdclock();
-
- fflush(stdout);
-
- fprintf(stderr, "\n");
- fprintf(stderr, " Performance counter stats for \'%s\':\n",
- argv[0]);
- fprintf(stderr, "\n");
-
- for (counter = 0; counter < nr_counters; counter++) {
- int cpu, nv;
- __u64 count[3], single_count[3];
- int scaled;
-
- count[0] = count[1] = count[2] = 0;
- nv = scale ? 3 : 1;
- for (cpu = 0; cpu < nr_cpus; cpu ++) {
- res = read(fd[cpu][counter],
- single_count, nv * sizeof(__u64));
- assert(res == nv * sizeof(__u64));
-
- count[0] += single_count[0];
- if (scale) {
- count[1] += single_count[1];
- count[2] += single_count[2];
- }
- }
-
- scaled = 0;
- if (scale) {
- if (count[2] == 0) {
- fprintf(stderr, " %14s %-20s\n",
- "<not counted>", event_name(counter));
- continue;
- }
- if (count[2] < count[1]) {
- scaled = 1;
- count[0] = (unsigned long long)
- ((double)count[0] * count[1] / count[2] + 0.5);
- }
- }
-
- if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
- event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
-
- double msecs = (double)count[0] / 1000000;
-
- fprintf(stderr, " %14.6f %-20s (msecs)",
- msecs, event_name(counter));
- } else {
- fprintf(stderr, " %14Ld %-20s (events)",
- count[0], event_name(counter));
- }
- if (scaled)
- fprintf(stderr, " (scaled from %.2f%%)",
- (double) count[2] / count[1] * 100);
- fprintf(stderr, "\n");
- }
- fprintf(stderr, "\n");
- fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
- (double)(t1-t0)/1e6);
- fprintf(stderr, "\n");
-
- return 0;
-}
-
 /*
  * Symbols
  */
@@ -805,7 +620,7 @@ static int read_symbol(FILE *in, struct sym_entry *s)
  return 0;
 }
 
-int compare_addr(const void *__sym1, const void *__sym2)
+static int compare_addr(const void *__sym1, const void *__sym2)
 {
  const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
 
@@ -1070,9 +885,6 @@ static void process_options(int argc, char **argv)
 {
  int error = 0, counter;
 
- if (strstr(argv[0], "perfstat"))
- run_perfstat = 1;
-
  for (;;) {
  int option_index = 0;
  /** Options for getopt */
@@ -1134,7 +946,6 @@ static void process_options(int argc, char **argv)
  tid = atoi(optarg); break;
  case 'r': realtime_prio = atoi(optarg); break;
  case 's': sym_filter = strdup(optarg); break;
- case 'S': run_perfstat = 1; break;
  case 'x': vmlinux = strdup(optarg); break;
  case 'z': zero = 1; break;
  case 'm': mmap_pages = atoi(optarg); break;
@@ -1147,12 +958,8 @@ static void process_options(int argc, char **argv)
  display_help();
 
  if (!nr_counters) {
- if (run_perfstat)
- nr_counters = 8;
- else {
- nr_counters = 1;
- event_id[0] = 0;
- }
+ nr_counters = 1;
+ event_id[0] = 0;
  }
 
  for (counter = 0; counter < nr_counters; counter++) {
@@ -1308,9 +1115,6 @@ int cmd_top(int argc, char **argv, const char *prefix)
  assert(nr_cpus <= MAX_NR_CPUS);
  assert(nr_cpus >= 0);
 
- if (run_perfstat)
- return do_perfstat(argc, argv);
-
  if (tid != -1 || profile_cpu != -1)
  nr_cpus = 1;
 
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 41637444ce2d..a3bb6cd6bed3 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,5 +14,6 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, char **argv, const char *prefix);
+extern int cmd_stat(int argc, char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 1eab3659b206..52455d46bfb5 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,4 +1,5 @@
 # List of known perf commands.
 # command name category [deprecated] [common]
 perf-top mainporcelain common
+perf-stat mainporcelain common
 
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
deleted file mode 100644
index 042c1b83a872..000000000000
--- a/Documentation/perf_counter/kerneltop.c
+++ /dev/null
@@ -1,1409 +0,0 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#define _GNU_SOURCE
65#include <sys/types.h>
66#include <sys/stat.h>
67#include <sys/time.h>
68#include <unistd.h>
69#include <stdint.h>
70#include <stdlib.h>
71#include <string.h>
72#include <limits.h>
73#include <getopt.h>
74#include <assert.h>
75#include <fcntl.h>
76#include <stdio.h>
77#include <errno.h>
78#include <ctype.h>
79#include <time.h>
80#include <sched.h>
81#include <pthread.h>
82
83#include <sys/syscall.h>
84#include <sys/ioctl.h>
85#include <sys/poll.h>
86#include <sys/prctl.h>
87#include <sys/wait.h>
88#include <sys/uio.h>
89#include <sys/mman.h>
90
91#include <linux/unistd.h>
92#include <linux/types.h>
93
94#include "../../include/linux/perf_counter.h"
95
96
97/*
98 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
99 * counters in the current task.
100 */
101#define PR_TASK_PERF_COUNTERS_DISABLE 31
102#define PR_TASK_PERF_COUNTERS_ENABLE 32
103
104#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
105
106#define rdclock() \
107({ \
108 struct timespec ts; \
109 \
110 clock_gettime(CLOCK_MONOTONIC, &ts); \
111 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
112})
113
114/*
115 * Pick up some kernel type conventions:
116 */
117#define __user
118#define asmlinkage
119
120#ifdef __x86_64__
121#define __NR_perf_counter_open 295
122#define rmb() asm volatile("lfence" ::: "memory")
123#define cpu_relax() asm volatile("rep; nop" ::: "memory");
124#endif
125
126#ifdef __i386__
127#define __NR_perf_counter_open 333
128#define rmb() asm volatile("lfence" ::: "memory")
129#define cpu_relax() asm volatile("rep; nop" ::: "memory");
130#endif
131
132#ifdef __powerpc__
133#define __NR_perf_counter_open 319
134#define rmb() asm volatile ("sync" ::: "memory")
135#define cpu_relax() asm volatile ("" ::: "memory");
136#endif
137
138#define unlikely(x) __builtin_expect(!!(x), 0)
139#define min(x, y) ({ \
140 typeof(x) _min1 = (x); \
141 typeof(y) _min2 = (y); \
142 (void) (&_min1 == &_min2); \
143 _min1 < _min2 ? _min1 : _min2; })
144
145asmlinkage int sys_perf_counter_open(
146 struct perf_counter_hw_event *hw_event_uptr __user,
147 pid_t pid,
148 int cpu,
149 int group_fd,
150 unsigned long flags)
151{
152 return syscall(
153 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
154}
155
156#define MAX_COUNTERS 64
157#define MAX_NR_CPUS 256
158
159#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
160
161static int run_perfstat = 0;
162static int system_wide = 0;
163
164static int nr_counters = 0;
165static __u64 event_id[MAX_COUNTERS] = {
166 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
167 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
168 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
169 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
170
171 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
172 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
173 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
174 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
175};
176static int default_interval = 100000;
177static int event_count[MAX_COUNTERS];
178static int fd[MAX_NR_CPUS][MAX_COUNTERS];
179
180static __u64 count_filter = 100;
181
182static int tid = -1;
183static int profile_cpu = -1;
184static int nr_cpus = 0;
185static int nmi = 1;
186static unsigned int realtime_prio = 0;
187static int group = 0;
188static unsigned int page_size;
189static unsigned int mmap_pages = 16;
190static int use_mmap = 0;
191static int use_munmap = 0;
192
193static char *vmlinux;
194
195static char *sym_filter;
196static unsigned long filter_start;
197static unsigned long filter_end;
198
199static int delay_secs = 2;
200static int zero;
201static int dump_symtab;
202
203static int scale;
204
205struct source_line {
206 uint64_t EIP;
207 unsigned long count;
208 char *line;
209 struct source_line *next;
210};
211
212static struct source_line *lines;
213static struct source_line **lines_tail;
214
215const unsigned int default_count[] = {
216 1000000,
217 1000000,
218 10000,
219 10000,
220 1000000,
221 10000,
222};
223
224static char *hw_event_names[] = {
225 "CPU cycles",
226 "instructions",
227 "cache references",
228 "cache misses",
229 "branches",
230 "branch misses",
231 "bus cycles",
232};
233
234static char *sw_event_names[] = {
235 "cpu clock ticks",
236 "task clock ticks",
237 "pagefaults",
238 "context switches",
239 "CPU migrations",
240 "minor faults",
241 "major faults",
242};
243
244struct event_symbol {
245 __u64 event;
246 char *symbol;
247};
248
249static struct event_symbol event_symbols[] = {
250 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
251 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
252 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
253 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
254 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
255 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
256 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
257 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
258 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
259
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
261 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
263 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
264 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
265 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
266 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
267 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
268 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
269 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
270};
271
272#define __PERF_COUNTER_FIELD(config, name) \
273 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
274
275#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
276#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
277#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
278#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
279
280static void display_events_help(void)
281{
282 unsigned int i;
283 __u64 e;
284
285 printf(
286 " -e EVENT --event=EVENT # symbolic-name abbreviations");
287
288 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
289 int type, id;
290
291 e = event_symbols[i].event;
292 type = PERF_COUNTER_TYPE(e);
293 id = PERF_COUNTER_ID(e);
294
295 printf("\n %d:%d: %-20s",
296 type, id, event_symbols[i].symbol);
297 }
298
299 printf("\n"
300 " rNNN: raw PMU events (eventsel+umask)\n\n");
301}
302
303static void display_perfstat_help(void)
304{
305 printf(
306 "Usage: perfstat [<events...>] <cmd...>\n\n"
307 "PerfStat Options (up to %d event types can be specified):\n\n",
308 MAX_COUNTERS);
309
310 display_events_help();
311
312 printf(
313 " -l # scale counter values\n"
314 " -a # system-wide collection\n");
315 exit(0);
316}
317
318static void display_help(void)
319{
320 if (run_perfstat)
321 return display_perfstat_help();
322
323 printf(
324 "Usage: kerneltop [<options>]\n"
325 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
326 "KernelTop Options (up to %d event types can be specified at once):\n\n",
327 MAX_COUNTERS);
328
329 display_events_help();
330
331 printf(
332 " -S --stat # perfstat COMMAND\n"
333 " -a # system-wide collection (for perfstat)\n\n"
334 " -c CNT --count=CNT # event period to sample\n\n"
335 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
336 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
337 " -l # show scale factor for RR events\n"
338 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
339 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
340 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
341 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
342 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
343 " -z --zero # zero counts after display\n"
344 " -D --dump_symtab # dump symbol table to stderr on startup\n"
345 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
346 " -M --mmap_info # print mmap info stream\n"
347 " -U --munmap_info # print munmap info stream\n"
348 );
349
350 exit(0);
351}
352
353static char *event_name(int ctr)
354{
355 __u64 config = event_id[ctr];
356 int type = PERF_COUNTER_TYPE(config);
357 int id = PERF_COUNTER_ID(config);
358 static char buf[32];
359
360 if (PERF_COUNTER_RAW(config)) {
361 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
362 return buf;
363 }
364
365 switch (type) {
366 case PERF_TYPE_HARDWARE:
367 if (id < PERF_HW_EVENTS_MAX)
368 return hw_event_names[id];
369 return "unknown-hardware";
370
371 case PERF_TYPE_SOFTWARE:
372 if (id < PERF_SW_EVENTS_MAX)
373 return sw_event_names[id];
374 return "unknown-software";
375
376 default:
377 break;
378 }
379
380 return "unknown";
381}
382
383/*
384 * Each event can have multiple symbolic names.
385 * Symbolic names are (almost) exactly matched.
386 */
387static __u64 match_event_symbols(char *str)
388{
389 __u64 config, id;
390 int type;
391 unsigned int i;
392
393 if (sscanf(str, "r%llx", &config) == 1)
394 return config | PERF_COUNTER_RAW_MASK;
395
396 if (sscanf(str, "%d:%llu", &type, &id) == 2)
397 return EID(type, id);
398
399 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
400 if (!strncmp(str, event_symbols[i].symbol,
401 strlen(event_symbols[i].symbol)))
402 return event_symbols[i].event;
403 }
404
405 return ~0ULL;
406}
407
408static int parse_events(char *str)
409{
410 __u64 config;
411
412again:
413 if (nr_counters == MAX_COUNTERS)
414 return -1;
415
416 config = match_event_symbols(str);
417 if (config == ~0ULL)
418 return -1;
419
420 event_id[nr_counters] = config;
421 nr_counters++;
422
423 str = strstr(str, ",");
424 if (str) {
425 str++;
426 goto again;
427 }
428
429 return 0;
430}
431
432
433/*
434 * perfstat
435 */
436
437char fault_here[1000000];
438
439static void create_perfstat_counter(int counter)
440{
441 struct perf_counter_hw_event hw_event;
442
443 memset(&hw_event, 0, sizeof(hw_event));
444 hw_event.config = event_id[counter];
445 hw_event.record_type = 0;
446 hw_event.nmi = 0;
447 if (scale)
448 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
449 PERF_FORMAT_TOTAL_TIME_RUNNING;
450
451 if (system_wide) {
452 int cpu;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
455 if (fd[cpu][counter] < 0) {
456 printf("perfstat error: syscall returned with %d (%s)\n",
457 fd[cpu][counter], strerror(errno));
458 exit(-1);
459 }
460 }
461 } else {
462 hw_event.inherit = 1;
463 hw_event.disabled = 1;
464
465 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
466 if (fd[0][counter] < 0) {
467 printf("perfstat error: syscall returned with %d (%s)\n",
468 fd[0][counter], strerror(errno));
469 exit(-1);
470 }
471 }
472}
473
474int do_perfstat(int argc, char *argv[])
475{
476 unsigned long long t0, t1;
477 int counter;
478 ssize_t res;
479 int status;
480 int pid;
481
482 if (!system_wide)
483 nr_cpus = 1;
484
485 for (counter = 0; counter < nr_counters; counter++)
486 create_perfstat_counter(counter);
487
488 argc -= optind;
489 argv += optind;
490
491 if (!argc)
492 display_help();
493
494 /*
495 * Enable counters and exec the command:
496 */
497 t0 = rdclock();
498 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
499
500 if ((pid = fork()) < 0)
501 perror("failed to fork");
502 if (!pid) {
503 if (execvp(argv[0], argv)) {
504 perror(argv[0]);
505 exit(-1);
506 }
507 }
508 while (wait(&status) >= 0)
509 ;
510 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
511 t1 = rdclock();
512
513 fflush(stdout);
514
515 fprintf(stderr, "\n");
516 fprintf(stderr, " Performance counter stats for \'%s\':\n",
517 argv[0]);
518 fprintf(stderr, "\n");
519
520 for (counter = 0; counter < nr_counters; counter++) {
521 int cpu, nv;
522 __u64 count[3], single_count[3];
523 int scaled;
524
525 count[0] = count[1] = count[2] = 0;
526 nv = scale ? 3 : 1;
527 for (cpu = 0; cpu < nr_cpus; cpu ++) {
528 res = read(fd[cpu][counter],
529 single_count, nv * sizeof(__u64));
530 assert(res == nv * sizeof(__u64));
531
532 count[0] += single_count[0];
533 if (scale) {
534 count[1] += single_count[1];
535 count[2] += single_count[2];
536 }
537 }
538
539 scaled = 0;
540 if (scale) {
541 if (count[2] == 0) {
542 fprintf(stderr, " %14s %-20s\n",
543 "<not counted>", event_name(counter));
544 continue;
545 }
546 if (count[2] < count[1]) {
547 scaled = 1;
548 count[0] = (unsigned long long)
549 ((double)count[0] * count[1] / count[2] + 0.5);
550 }
551 }
552
553 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
554 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
555
556 double msecs = (double)count[0] / 1000000;
557
558 fprintf(stderr, " %14.6f %-20s (msecs)",
559 msecs, event_name(counter));
560 } else {
561 fprintf(stderr, " %14Ld %-20s (events)",
562 count[0], event_name(counter));
563 }
564 if (scaled)
565 fprintf(stderr, " (scaled from %.2f%%)",
566 (double) count[2] / count[1] * 100);
567 fprintf(stderr, "\n");
568 }
569 fprintf(stderr, "\n");
570 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
571 (double)(t1-t0)/1e6);
572 fprintf(stderr, "\n");
573
574 return 0;
575}
576
577/*
578 * Symbols
579 */
580
581static uint64_t min_ip;
582static uint64_t max_ip = -1ll;
583
584struct sym_entry {
585 unsigned long long addr;
586 char *sym;
587 unsigned long count[MAX_COUNTERS];
588 int skip;
589 struct source_line *source;
590};
591
592#define MAX_SYMS 100000
593
594static int sym_table_count;
595
596struct sym_entry *sym_filter_entry;
597
598static struct sym_entry sym_table[MAX_SYMS];
599
600static void show_details(struct sym_entry *sym);
601
602/*
603 * Ordering weight: count-1 * count-2 * ... / count-n
604 */
605static double sym_weight(const struct sym_entry *sym)
606{
607 double weight;
608 int counter;
609
610 weight = sym->count[0];
611
612 for (counter = 1; counter < nr_counters-1; counter++)
613 weight *= sym->count[counter];
614
615 weight /= (sym->count[counter] + 1);
616
617 return weight;
618}
619
620static int compare(const void *__sym1, const void *__sym2)
621{
622 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
623
624 return sym_weight(sym1) < sym_weight(sym2);
625}
626
627static long events;
628static long userspace_events;
629static const char CONSOLE_CLEAR[] = "";
630
631static struct sym_entry tmp[MAX_SYMS];
632
633static void print_sym_table(void)
634{
635 int i, printed;
636 int counter;
637 float events_per_sec = events/delay_secs;
638 float kevents_per_sec = (events-userspace_events)/delay_secs;
639 float sum_kevents = 0.0;
640
641 events = userspace_events = 0;
642 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
643 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
644
645 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
646 sum_kevents += tmp[i].count[0];
647
648 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
649
650 printf(
651"------------------------------------------------------------------------------\n");
652 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
653 events_per_sec,
654 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
655 nmi ? "NMI" : "IRQ");
656
657 if (nr_counters == 1)
658 printf("%d ", event_count[0]);
659
660 for (counter = 0; counter < nr_counters; counter++) {
661 if (counter)
662 printf("/");
663
664 printf("%s", event_name(counter));
665 }
666
667 printf( "], ");
668
669 if (tid != -1)
670 printf(" (tid: %d", tid);
671 else
672 printf(" (all");
673
674 if (profile_cpu != -1)
675 printf(", cpu: %d)\n", profile_cpu);
676 else {
677 if (tid != -1)
678 printf(")\n");
679 else
680 printf(", %d CPUs)\n", nr_cpus);
681 }
682
683 printf("------------------------------------------------------------------------------\n\n");
684
685 if (nr_counters == 1)
686 printf(" events pcnt");
687 else
688 printf(" weight events pcnt");
689
690 printf(" RIP kernel function\n"
691 " ______ ______ _____ ________________ _______________\n\n"
692 );
693
694 for (i = 0, printed = 0; i < sym_table_count; i++) {
695 float pcnt;
696 int count;
697
698 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
699 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
700
701 if (nr_counters == 1)
702 printf("%19.2f - %4.1f%% - %016llx : %s\n",
703 sym_weight(tmp + i),
704 pcnt, tmp[i].addr, tmp[i].sym);
705 else
706 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
707 sym_weight(tmp + i),
708 tmp[i].count[0],
709 pcnt, tmp[i].addr, tmp[i].sym);
710 printed++;
711 }
712 /*
713 * Add decay to the counts:
714 */
715 for (count = 0; count < nr_counters; count++)
716 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
717 }
718
719 if (sym_filter_entry)
720 show_details(sym_filter_entry);
721
722 {
723 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
724
725 if (poll(&stdin_poll, 1, 0) == 1) {
726 printf("key pressed - exiting.\n");
727 exit(0);
728 }
729 }
730}
731
732static void *display_thread(void *arg)
733{
734 printf("KernelTop refresh period: %d seconds\n", delay_secs);
735
736 while (!sleep(delay_secs))
737 print_sym_table();
738
739 return NULL;
740}
741
742static int read_symbol(FILE *in, struct sym_entry *s)
743{
744 static int filter_match = 0;
745 char *sym, stype;
746 char str[500];
747 int rc, pos;
748
749 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
750 if (rc == EOF)
751 return -1;
752
753 assert(rc == 3);
754
755 /* skip until end of line: */
756 pos = strlen(str);
757 do {
758 rc = fgetc(in);
759 if (rc == '\n' || rc == EOF || pos >= 499)
760 break;
761 str[pos] = rc;
762 pos++;
763 } while (1);
764 str[pos] = 0;
765
766 sym = str;
767
768 /* Filter out known duplicates and non-text symbols. */
769 if (!strcmp(sym, "_text"))
770 return 1;
771 if (!min_ip && !strcmp(sym, "_stext"))
772 return 1;
773 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
774 return 1;
775 if (stype != 'T' && stype != 't')
776 return 1;
777 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
778 return 1;
779 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
780 return 1;
781
782 s->sym = malloc(strlen(str));
783 assert(s->sym);
784
785 strcpy((char *)s->sym, str);
786 s->skip = 0;
787
788 /* Tag events to be skipped. */
789 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
790 s->skip = 1;
791 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
792 s->skip = 1;
793 else if (!strcmp("mwait_idle", s->sym))
794 s->skip = 1;
795
796 if (filter_match == 1) {
797 filter_end = s->addr;
798 filter_match = -1;
799 if (filter_end - filter_start > 10000) {
800 printf("hm, too large filter symbol <%s> - skipping.\n",
801 sym_filter);
802 printf("symbol filter start: %016lx\n", filter_start);
803 printf(" end: %016lx\n", filter_end);
804 filter_end = filter_start = 0;
805 sym_filter = NULL;
806 sleep(1);
807 }
808 }
809 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
810 filter_match = 1;
811 filter_start = s->addr;
812 }
813
814 return 0;
815}
816
817int compare_addr(const void *__sym1, const void *__sym2)
818{
819 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
820
821 return sym1->addr > sym2->addr;
822}
823
824static void sort_symbol_table(void)
825{
826 int i, dups;
827
828 do {
829 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
830 for (i = 0, dups = 0; i < sym_table_count; i++) {
831 if (sym_table[i].addr == sym_table[i+1].addr) {
832 sym_table[i+1].addr = -1ll;
833 dups++;
834 }
835 }
836 sym_table_count -= dups;
837 } while(dups);
838}
839
840static void parse_symbols(void)
841{
842 struct sym_entry *last;
843
844 FILE *kallsyms = fopen("/proc/kallsyms", "r");
845
846 if (!kallsyms) {
847 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
848 exit(-1);
849 }
850
851 while (!feof(kallsyms)) {
852 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
853 sym_table_count++;
854 assert(sym_table_count <= MAX_SYMS);
855 }
856 }
857
858 sort_symbol_table();
859 min_ip = sym_table[0].addr;
860 max_ip = sym_table[sym_table_count-1].addr;
861 last = sym_table + sym_table_count++;
862
863 last->addr = -1ll;
864 last->sym = "<end>";
865
866 if (filter_end) {
867 int count;
868 for (count=0; count < sym_table_count; count ++) {
869 if (!strcmp(sym_table[count].sym, sym_filter)) {
870 sym_filter_entry = &sym_table[count];
871 break;
872 }
873 }
874 }
875 if (dump_symtab) {
876 int i;
877
878 for (i = 0; i < sym_table_count; i++)
879 fprintf(stderr, "%llx %s\n",
880 sym_table[i].addr, sym_table[i].sym);
881 }
882}
883
884/*
885 * Source lines
886 */
887
888static void parse_vmlinux(char *filename)
889{
890 FILE *file;
891 char command[PATH_MAX*2];
892 if (!filename)
893 return;
894
895 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
896
897 file = popen(command, "r");
898 if (!file)
899 return;
900
901 lines_tail = &lines;
902 while (!feof(file)) {
903 struct source_line *src;
904 size_t dummy = 0;
905 char *c;
906
907 src = malloc(sizeof(struct source_line));
908 assert(src != NULL);
909 memset(src, 0, sizeof(struct source_line));
910
911 if (getline(&src->line, &dummy, file) < 0)
912 break;
913 if (!src->line)
914 break;
915
916 c = strchr(src->line, '\n');
917 if (c)
918 *c = 0;
919
920 src->next = NULL;
921 *lines_tail = src;
922 lines_tail = &src->next;
923
924 if (strlen(src->line)>8 && src->line[8] == ':')
925 src->EIP = strtoull(src->line, NULL, 16);
926 if (strlen(src->line)>8 && src->line[16] == ':')
927 src->EIP = strtoull(src->line, NULL, 16);
928 }
929 pclose(file);
930}
931
932static void record_precise_ip(uint64_t ip)
933{
934 struct source_line *line;
935
936 for (line = lines; line; line = line->next) {
937 if (line->EIP == ip)
938 line->count++;
939 if (line->EIP > ip)
940 break;
941 }
942}
943
944static void lookup_sym_in_vmlinux(struct sym_entry *sym)
945{
946 struct source_line *line;
947 char pattern[PATH_MAX];
948 sprintf(pattern, "<%s>:", sym->sym);
949
950 for (line = lines; line; line = line->next) {
951 if (strstr(line->line, pattern)) {
952 sym->source = line;
953 break;
954 }
955 }
956}
957
958static void show_lines(struct source_line *line_queue, int line_queue_count)
959{
960 int i;
961 struct source_line *line;
962
963 line = line_queue;
964 for (i = 0; i < line_queue_count; i++) {
965 printf("%8li\t%s\n", line->count, line->line);
966 line = line->next;
967 }
968}
969
970#define TRACE_COUNT 3
971
972static void show_details(struct sym_entry *sym)
973{
974 struct source_line *line;
975 struct source_line *line_queue = NULL;
976 int displayed = 0;
977 int line_queue_count = 0;
978
979 if (!sym->source)
980 lookup_sym_in_vmlinux(sym);
981 if (!sym->source)
982 return;
983
984 printf("Showing details for %s\n", sym->sym);
985
986 line = sym->source;
987 while (line) {
988 if (displayed && strstr(line->line, ">:"))
989 break;
990
991 if (!line_queue_count)
992 line_queue = line;
993 line_queue_count ++;
994
995 if (line->count >= count_filter) {
996 show_lines(line_queue, line_queue_count);
997 line_queue_count = 0;
998 line_queue = NULL;
999 } else if (line_queue_count > TRACE_COUNT) {
1000 line_queue = line_queue->next;
1001 line_queue_count --;
1002 }
1003
1004 line->count = 0;
1005 displayed++;
1006 if (displayed > 300)
1007 break;
1008 line = line->next;
1009 }
1010}
1011
1012/*
1013 * Binary search in the histogram table and record the hit:
1014 */
1015static void record_ip(uint64_t ip, int counter)
1016{
1017 int left_idx, middle_idx, right_idx, idx;
1018 unsigned long left, middle, right;
1019
1020 record_precise_ip(ip);
1021
1022 left_idx = 0;
1023 right_idx = sym_table_count-1;
1024 assert(ip <= max_ip && ip >= min_ip);
1025
1026 while (left_idx + 1 < right_idx) {
1027 middle_idx = (left_idx + right_idx) / 2;
1028
1029 left = sym_table[ left_idx].addr;
1030 middle = sym_table[middle_idx].addr;
1031 right = sym_table[ right_idx].addr;
1032
1033 if (!(left <= middle && middle <= right)) {
1034 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1035 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1036 }
1037 assert(left <= middle && middle <= right);
1038 if (!(left <= ip && ip <= right)) {
1039 printf(" left: %016lx\n", left);
1040 printf(" ip: %016lx\n", (unsigned long)ip);
1041 printf("right: %016lx\n", right);
1042 }
1043 assert(left <= ip && ip <= right);
1044 /*
1045 * [ left .... target .... middle .... right ]
1046 * => right := middle
1047 */
1048 if (ip < middle) {
1049 right_idx = middle_idx;
1050 continue;
1051 }
1052 /*
1053 * [ left .... middle ... target ... right ]
1054 * => left := middle
1055 */
1056 left_idx = middle_idx;
1057 }
1058
1059 idx = left_idx;
1060
1061 if (!sym_table[idx].skip)
1062 sym_table[idx].count[counter]++;
1063 else events--;
1064}
1065
1066static void process_event(uint64_t ip, int counter)
1067{
1068 events++;
1069
1070 if (ip < min_ip || ip > max_ip) {
1071 userspace_events++;
1072 return;
1073 }
1074
1075 record_ip(ip, counter);
1076}
1077
1078static void process_options(int argc, char *argv[])
1079{
1080 int error = 0, counter;
1081
1082 if (strstr(argv[0], "perfstat"))
1083 run_perfstat = 1;
1084
1085 for (;;) {
1086 int option_index = 0;
1087 /** Options for getopt */
1088 static struct option long_options[] = {
1089 {"count", required_argument, NULL, 'c'},
1090 {"cpu", required_argument, NULL, 'C'},
1091 {"delay", required_argument, NULL, 'd'},
1092 {"dump_symtab", no_argument, NULL, 'D'},
1093 {"event", required_argument, NULL, 'e'},
1094 {"filter", required_argument, NULL, 'f'},
1095 {"group", required_argument, NULL, 'g'},
1096 {"help", no_argument, NULL, 'h'},
1097 {"nmi", required_argument, NULL, 'n'},
1098 {"mmap_info", no_argument, NULL, 'M'},
1099 {"mmap_pages", required_argument, NULL, 'm'},
1100 {"munmap_info", no_argument, NULL, 'U'},
1101 {"pid", required_argument, NULL, 'p'},
1102 {"realtime", required_argument, NULL, 'r'},
1103 {"scale", no_argument, NULL, 'l'},
1104 {"symbol", required_argument, NULL, 's'},
1105 {"stat", no_argument, NULL, 'S'},
1106 {"vmlinux", required_argument, NULL, 'x'},
1107 {"zero", no_argument, NULL, 'z'},
1108 {NULL, 0, NULL, 0 }
1109 };
1110 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
1111 long_options, &option_index);
1112 if (c == -1)
1113 break;
1114
1115 switch (c) {
1116 case 'a': system_wide = 1; break;
1117 case 'c': default_interval = atoi(optarg); break;
1118 case 'C':
1119 /* CPU and PID are mutually exclusive */
1120 if (tid != -1) {
1121 printf("WARNING: CPU switch overriding PID\n");
1122 sleep(1);
1123 tid = -1;
1124 }
1125 profile_cpu = atoi(optarg); break;
1126 case 'd': delay_secs = atoi(optarg); break;
1127 case 'D': dump_symtab = 1; break;
1128
1129 case 'e': error = parse_events(optarg); break;
1130
1131 case 'f': count_filter = atoi(optarg); break;
1132 case 'g': group = atoi(optarg); break;
1133 case 'h': display_help(); break;
1134 case 'l': scale = 1; break;
1135 case 'n': nmi = atoi(optarg); break;
1136 case 'p':
1137 /* CPU and PID are mutually exclusive */
1138 if (profile_cpu != -1) {
1139 printf("WARNING: PID switch overriding CPU\n");
1140 sleep(1);
1141 profile_cpu = -1;
1142 }
1143 tid = atoi(optarg); break;
1144 case 'r': realtime_prio = atoi(optarg); break;
1145 case 's': sym_filter = strdup(optarg); break;
1146 case 'S': run_perfstat = 1; break;
1147 case 'x': vmlinux = strdup(optarg); break;
1148 case 'z': zero = 1; break;
1149 case 'm': mmap_pages = atoi(optarg); break;
1150 case 'M': use_mmap = 1; break;
1151 case 'U': use_munmap = 1; break;
1152 default: error = 1; break;
1153 }
1154 }
1155 if (error)
1156 display_help();
1157
1158 if (!nr_counters) {
1159 if (run_perfstat)
1160 nr_counters = 8;
1161 else {
1162 nr_counters = 1;
1163 event_id[0] = 0;
1164 }
1165 }
1166
1167 for (counter = 0; counter < nr_counters; counter++) {
1168 if (event_count[counter])
1169 continue;
1170
1171 event_count[counter] = default_interval;
1172 }
1173}
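
process_options() pairs every long option with a short letter and lets getopt_long() drive the parsing; anything unrecognised sets error and ends up in display_help(). A minimal, self-contained example of that same pattern, with invented option names rather than the tool's real ones:

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	int interval = 1000, verbose = 0;

	static struct option long_options[] = {
		{"interval", required_argument, NULL, 'i'},
		{"verbose",  no_argument,       NULL, 'v'},
		{NULL, 0, NULL, 0}
	};

	for (;;) {
		int c = getopt_long(argc, argv, "i:v", long_options, NULL);

		if (c == -1)
			break;

		switch (c) {
		case 'i': interval = atoi(optarg); break;
		case 'v': verbose = 1; break;
		default:
			fprintf(stderr, "usage: %s [-i msecs] [-v]\n", argv[0]);
			exit(1);
		}
	}

	printf("interval=%d verbose=%d\n", interval, verbose);
	return 0;
}
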
1174
1175struct mmap_data {
1176 int counter;
1177 void *base;
1178 unsigned int mask;
1179 unsigned int prev;
1180};
1181
1182static unsigned int mmap_read_head(struct mmap_data *md)
1183{
1184 struct perf_counter_mmap_page *pc = md->base;
1185 int head;
1186
1187 head = pc->data_head;
1188 rmb();
1189
1190 return head;
1191}
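
mmap_read_head() reads the head index published by the kernel and issues a read barrier before any buffer contents are touched, so the records it then consumes are the ones the producer had completed before publishing that head value. A rough standalone analogue of that ordering, using a C11 acquire load in place of the tool's rmb() macro; the ring layout here is simplified and entirely hypothetical:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct ring {
	_Atomic uint32_t data_head;	/* advanced by the producer        */
	uint32_t mask;			/* buffer size - 1 (power of two)  */
	unsigned char data[16];
};

static void print_byte(unsigned char c)
{
	printf("consumed %u\n", c);
}

static uint32_t ring_consume(struct ring *r, uint32_t tail,
			     void (*cb)(unsigned char))
{
	/* the acquire load pairs with the producer's release store,
	 * standing in for the plain load plus rmb() in mmap_read_head() */
	uint32_t head = atomic_load_explicit(&r->data_head,
					     memory_order_acquire);

	while (tail != head) {
		cb(r->data[tail & r->mask]);
		tail++;
	}
	return tail;	/* remembered like md->prev */
}

int main(void)
{
	struct ring r = { .mask = 15 };
	uint32_t tail = 0;
	unsigned char i;

	/* a toy single-threaded producer: write data, then publish the head */
	for (i = 0; i < 5; i++)
		r.data[i] = i;
	atomic_store_explicit(&r.data_head, 5, memory_order_release);

	tail = ring_consume(&r, tail, print_byte);
	printf("tail is now %u\n", tail);
	return 0;
}
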
1192
1193struct timeval last_read, this_read;
1194
1195static void mmap_read(struct mmap_data *md)
1196{
1197 unsigned int head = mmap_read_head(md);
1198 unsigned int old = md->prev;
1199 unsigned char *data = md->base + page_size;
1200 int diff;
1201
1202 gettimeofday(&this_read, NULL);
1203
1204 /*
1205 * If we're further behind than half the buffer, there's a chance
1206 * the writer will bite our tail and screw up the events under us.
1207 *
1208 * If we somehow ended up ahead of the head, we got messed up.
1209 *
1210 * In either case, truncate and restart at head.
1211 */
1212 diff = head - old;
1213 if (diff > md->mask / 2 || diff < 0) {
1214 struct timeval iv;
1215 unsigned long msecs;
1216
1217 timersub(&this_read, &last_read, &iv);
1218 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1219
1220 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1221 " Last read %lu msecs ago.\n", msecs);
1222
1223 /*
1224 * head points to a known good entry, start there.
1225 */
1226 old = head;
1227 }
1228
1229 last_read = this_read;
1230
1231 for (; old != head;) {
1232 struct ip_event {
1233 struct perf_event_header header;
1234 __u64 ip;
1235 __u32 pid, tid;
1236 };
1237 struct mmap_event {
1238 struct perf_event_header header;
1239 __u32 pid, tid;
1240 __u64 start;
1241 __u64 len;
1242 __u64 pgoff;
1243 char filename[PATH_MAX];
1244 };
1245
1246 typedef union event_union {
1247 struct perf_event_header header;
1248 struct ip_event ip;
1249 struct mmap_event mmap;
1250 } event_t;
1251
1252 event_t *event = (event_t *)&data[old & md->mask];
1253
1254 event_t event_copy;
1255
1256 unsigned int size = event->header.size;
1257
1258 /*
1259 * Event straddles the mmap boundary -- header should always
1260 * be inside due to u64 alignment of output.
1261 */
1262 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1263 unsigned int offset = old;
1264 unsigned int len = min(sizeof(*event), size), cpy;
1265 void *dst = &event_copy;
1266
1267 do {
1268 cpy = min(md->mask + 1 - (offset & md->mask), len);
1269 memcpy(dst, &data[offset & md->mask], cpy);
1270 offset += cpy;
1271 dst += cpy;
1272 len -= cpy;
1273 } while (len);
1274
1275 event = &event_copy;
1276 }
1277
1278 old += size;
1279
1280 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
1281 if (event->header.type & PERF_RECORD_IP)
1282 process_event(event->ip.ip, md->counter);
1283 } else {
1284 switch (event->header.type) {
1285 case PERF_EVENT_MMAP:
1286 case PERF_EVENT_MUNMAP:
1287 printf("%s: %Lu %Lu %Lu %s\n",
1288 event->header.type == PERF_EVENT_MMAP
1289 ? "mmap" : "munmap",
1290 event->mmap.start,
1291 event->mmap.len,
1292 event->mmap.pgoff,
1293 event->mmap.filename);
1294 break;
1295 }
1296 }
1297 }
1298
1299 md->prev = old;
1300}
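
The straddle case above reassembles a record that wraps past the end of the ring into event_copy, copying at most two chunks: the bytes up to the end of the buffer, then the remainder from its start. A standalone sketch of that wrap-around copy against a toy power-of-two buffer (sizes and contents are made up):

#include <stdio.h>
#include <string.h>

#define BUF_SIZE 16			/* must be a power of two */
#define MASK	 (BUF_SIZE - 1)

/* copy 'size' bytes starting at logical offset 'old', even if the
 * record wraps past the end of the circular buffer */
static void copy_record(const unsigned char *data, unsigned int old,
			unsigned int size, unsigned char *dst)
{
	unsigned int offset = old, len = size, cpy;

	do {
		cpy = BUF_SIZE - (offset & MASK);
		if (cpy > len)
			cpy = len;
		memcpy(dst, &data[offset & MASK], cpy);
		offset += cpy;
		dst += cpy;
		len -= cpy;
	} while (len);
}

int main(void)
{
	unsigned char ring[BUF_SIZE], out[8];
	unsigned int i;

	for (i = 0; i < BUF_SIZE; i++)
		ring[i] = i;

	/* an 8-byte record starting 4 bytes before the end wraps around */
	copy_record(ring, 12, 8, out);

	for (i = 0; i < 8; i++)
		printf("%u ", out[i]);
	printf("\n");
	return 0;
}
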
1301
1302int main(int argc, char *argv[])
1303{
1304 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1305 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1306 struct perf_counter_hw_event hw_event;
1307 pthread_t thread;
1308 int i, counter, group_fd, nr_poll = 0;
1309 unsigned int cpu;
1310 int ret;
1311
1312 page_size = sysconf(_SC_PAGE_SIZE);
1313
1314 process_options(argc, argv);
1315
1316 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1317 assert(nr_cpus <= MAX_NR_CPUS);
1318 assert(nr_cpus >= 0);
1319
1320 if (run_perfstat)
1321 return do_perfstat(argc, argv);
1322
1323 if (tid != -1 || profile_cpu != -1)
1324 nr_cpus = 1;
1325
1326 parse_symbols();
1327 if (vmlinux && sym_filter_entry)
1328 parse_vmlinux(vmlinux);
1329
1330 for (i = 0; i < nr_cpus; i++) {
1331 group_fd = -1;
1332 for (counter = 0; counter < nr_counters; counter++) {
1333
1334 cpu = profile_cpu;
1335 if (tid == -1 && profile_cpu == -1)
1336 cpu = i;
1337
1338 memset(&hw_event, 0, sizeof(hw_event));
1339 hw_event.config = event_id[counter];
1340 hw_event.irq_period = event_count[counter];
1341 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1342 hw_event.nmi = nmi;
1343 hw_event.mmap = use_mmap;
1344 hw_event.munmap = use_munmap;
1345
1346 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1347 if (fd[i][counter] < 0) {
1348 int err = errno;
1349 printf("kerneltop error: syscall returned with %d (%s)\n",
1350 fd[i][counter], strerror(err));
1351 if (err == EPERM)
1352 printf("Are you root?\n");
1353 exit(-1);
1354 }
1355 assert(fd[i][counter] >= 0);
1356 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1357
1358 /*
1359 * First counter acts as the group leader:
1360 */
1361 if (group && group_fd == -1)
1362 group_fd = fd[i][counter];
1363
1364 event_array[nr_poll].fd = fd[i][counter];
1365 event_array[nr_poll].events = POLLIN;
1366 nr_poll++;
1367
1368 mmap_array[i][counter].counter = counter;
1369 mmap_array[i][counter].prev = 0;
1370 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1371 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1372 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1373 if (mmap_array[i][counter].base == MAP_FAILED) {
1374 printf("kerneltop error: failed to mmap with %d (%s)\n",
1375 errno, strerror(errno));
1376 exit(-1);
1377 }
1378 }
1379 }
1380
1381 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1382 printf("Could not create display thread.\n");
1383 exit(-1);
1384 }
1385
1386 if (realtime_prio) {
1387 struct sched_param param;
1388
1389 param.sched_priority = realtime_prio;
1390 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1391 printf("Could not set realtime priority.\n");
1392 exit(-1);
1393 }
1394 }
1395
1396 while (1) {
1397 int hits = events;
1398
1399 for (i = 0; i < nr_cpus; i++) {
1400 for (counter = 0; counter < nr_counters; counter++)
1401 mmap_read(&mmap_array[i][counter]);
1402 }
1403
1404 if (hits == events)
1405 ret = poll(event_array, nr_poll, 100);
1406 }
1407
1408 return 0;
1409}
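
main() drains every per-CPU, per-counter buffer on each pass and only blocks in poll() when a pass produced no new events, so a busy system is read back-to-back while an idle one sleeps for up to 100ms. A stripped-down sketch of that poll-when-idle loop, using an ordinary non-blocking pipe in place of counter fds and a plain byte count in place of the sample counters; everything here is illustrative:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	struct pollfd pfd;
	unsigned long events = 0;
	char buf[64];
	int rounds;

	if (pipe(pipefd))
		return 1;
	fcntl(pipefd[0], F_SETFL, O_NONBLOCK);	/* mirrors the F_SETFL above */

	/* hand the write end some data so the first pass finds work */
	if (write(pipefd[1], "abc", 3) != 3)
		return 1;

	pfd.fd = pipefd[0];
	pfd.events = POLLIN;

	for (rounds = 0; rounds < 3; rounds++) {
		unsigned long hits = events;
		ssize_t n;

		/* drain whatever is currently buffered */
		n = read(pipefd[0], buf, sizeof(buf));
		if (n > 0)
			events += n;

		/* nothing new this pass: sleep in poll() for up to 100ms,
		 * like the hits == events check in the loop above */
		if (hits == events)
			poll(&pfd, 1, 100);

		printf("pass %d: %lu events so far\n", rounds, events);
	}

	close(pipefd[0]);
	close(pipefd[1]);
	return 0;
}
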
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 63f8a892c0df..ff8658f2a2f1 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -249,6 +249,7 @@ static void handle_internal_command(int argc, const char **argv)
249 const char *cmd = argv[0]; 249 const char *cmd = argv[0];
250 static struct cmd_struct commands[] = { 250 static struct cmd_struct commands[] = {
251 { "top", cmd_top, 0 }, 251 { "top", cmd_top, 0 },
252 { "stat", cmd_stat, 0 },
252 }; 253 };
253 int i; 254 int i;
254 static const char ext[] = STRIP_EXTENSION; 255 static const char ext[] = STRIP_EXTENSION;