diff options
Diffstat (limited to 'Documentation/perf-counters.txt')
-rw-r--r-- | Documentation/perf-counters.txt | 107 |
1 files changed, 75 insertions, 32 deletions
diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt index 19033a0bb526..fddd32189a50 100644 --- a/Documentation/perf-counters.txt +++ b/Documentation/perf-counters.txt | |||
@@ -10,8 +10,8 @@ trigger interrupts when a threshold number of events have passed - and can | |||
10 | thus be used to profile the code that runs on that CPU. | 10 | thus be used to profile the code that runs on that CPU. |
11 | 11 | ||
12 | The Linux Performance Counter subsystem provides an abstraction of these | 12 | The Linux Performance Counter subsystem provides an abstraction of these |
13 | hardware capabilities. It provides per task and per CPU counters, and | 13 | hardware capabilities. It provides per task and per CPU counters, counter |
14 | it provides event capabilities on top of those. | 14 | groups, and it provides event capabilities on top of those. |
15 | 15 | ||
16 | Performance counters are accessed via special file descriptors. | 16 | Performance counters are accessed via special file descriptors. |
17 | There's one file descriptor per virtual counter used. | 17 | There's one file descriptor per virtual counter used. |
@@ -19,12 +19,8 @@ There's one file descriptor per virtual counter used. | |||
19 | The special file descriptor is opened via the perf_counter_open() | 19 | The special file descriptor is opened via the perf_counter_open() |
20 | system call: | 20 | system call: |
21 | 21 | ||
22 | int | 22 | int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, |
23 | perf_counter_open(u32 hw_event_type, | 23 | pid_t pid, int cpu, int group_fd); |
24 | u32 hw_event_period, | ||
25 | u32 record_type, | ||
26 | pid_t pid, | ||
27 | int cpu); | ||
28 | 24 | ||
29 | The syscall returns the new fd. The fd can be used via the normal | 25 | The syscall returns the new fd. The fd can be used via the normal |
30 | VFS system calls: read() can be used to read the counter, fcntl() | 26 | VFS system calls: read() can be used to read the counter, fcntl() |
@@ -33,39 +29,78 @@ can be used to set the blocking mode, etc. | |||
33 | Multiple counters can be kept open at a time, and the counters | 29 | Multiple counters can be kept open at a time, and the counters |
34 | can be poll()ed. | 30 | can be poll()ed. |
35 | 31 | ||
36 | When creating a new counter fd, 'hw_event_type' is one of: | 32 | When creating a new counter fd, 'perf_counter_hw_event' is: |
37 | 33 | ||
38 | enum hw_event_types { | 34 | /* |
39 | PERF_COUNT_CYCLES, | 35 | * Hardware event to monitor via a performance monitoring counter: |
40 | PERF_COUNT_INSTRUCTIONS, | 36 | */ |
41 | PERF_COUNT_CACHE_REFERENCES, | 37 | struct perf_counter_hw_event { |
42 | PERF_COUNT_CACHE_MISSES, | 38 | s64 type; |
43 | PERF_COUNT_BRANCH_INSTRUCTIONS, | 39 | |
44 | PERF_COUNT_BRANCH_MISSES, | 40 | u64 irq_period; |
45 | }; | 41 | u32 record_type; |
42 | |||
43 | u32 disabled : 1, /* off by default */ | ||
44 | nmi : 1, /* NMI sampling */ | ||
45 | raw : 1, /* raw event type */ | ||
46 | __reserved_1 : 29; | ||
47 | |||
48 | u64 __reserved_2; | ||
49 | }; | ||
50 | |||
51 | /* | ||
52 | * Generalized performance counter event types, used by the hw_event.type | ||
53 | * parameter of the sys_perf_counter_open() syscall: | ||
54 | */ | ||
55 | enum hw_event_types { | ||
56 | /* | ||
57 | * Common hardware events, generalized by the kernel: | ||
58 | */ | ||
59 | PERF_COUNT_CYCLES = 0, | ||
60 | PERF_COUNT_INSTRUCTIONS = 1, | ||
61 | PERF_COUNT_CACHE_REFERENCES = 2, | ||
62 | PERF_COUNT_CACHE_MISSES = 3, | ||
63 | PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | ||
64 | PERF_COUNT_BRANCH_MISSES = 5, | ||
65 | |||
66 | /* | ||
67 | * Special "software" counters provided by the kernel, even if | ||
68 | * the hardware does not support performance counters. These | ||
69 | * counters measure various physical and sw events of the | ||
70 | * kernel (and allow the profiling of them as well): | ||
71 | */ | ||
72 | PERF_COUNT_CPU_CLOCK = -1, | ||
73 | PERF_COUNT_TASK_CLOCK = -2, | ||
74 | /* | ||
75 | * Future software events: | ||
76 | */ | ||
77 | /* PERF_COUNT_PAGE_FAULTS = -3, | ||
78 | PERF_COUNT_CONTEXT_SWITCHES = -4, */ | ||
79 | }; | ||
46 | 80 | ||
47 | These are standardized types of events that work uniformly on all CPUs | 81 | These are standardized types of events that work uniformly on all CPUs |
48 | that implement Performance Counters support under Linux. If a CPU is | 82 | that implement Performance Counters support under Linux. If a CPU is |
49 | not able to count branch-misses, then the system call will return | 83 | not able to count branch-misses, then the system call will return |
50 | -EINVAL. | 84 | -EINVAL. |
51 | 85 | ||
52 | [ Note: more hw_event_types are supported as well, but they are CPU | 86 | More hw_event_types are supported as well, but they are CPU |
53 | specific and are enumerated via /sys on a per CPU basis. Raw hw event | 87 | specific and are enumerated via /sys on a per CPU basis. Raw hw event |
54 | types can be passed in as negative numbers. For example, to count | 88 | types can be passed in under hw_event.type if hw_event.raw is 1. |
55 | "External bus cycles while bus lock signal asserted" events on Intel | 89 | For example, to count "External bus cycles while bus lock signal asserted" |
56 | Core CPUs, pass in a -0x4064 event type value. ] | 90 | events on Intel Core CPUs, pass in a 0x4064 event type value and set |
57 | 91 | hw_event.raw to 1. | |
58 | The parameter 'hw_event_period' is the number of events before waking up | ||
59 | a read() that is blocked on a counter fd. Zero value means a non-blocking | ||
60 | counter. | ||
61 | 92 | ||
62 | 'record_type' is the type of data that a read() will provide for the | 93 | 'record_type' is the type of data that a read() will provide for the |
63 | counter, and it can be one of: | 94 | counter, and it can be one of: |
64 | 95 | ||
65 | enum perf_record_type { | 96 | /* |
66 | PERF_RECORD_SIMPLE, | 97 | * IRQ-notification data record type: |
67 | PERF_RECORD_IRQ, | 98 | */ |
68 | }; | 99 | enum perf_counter_record_type { |
100 | PERF_RECORD_SIMPLE = 0, | ||
101 | PERF_RECORD_IRQ = 1, | ||
102 | PERF_RECORD_GROUP = 2, | ||
103 | }; | ||
69 | 104 | ||
70 | a "simple" counter is one that counts hardware events and allows | 105 | a "simple" counter is one that counts hardware events and allows |
71 | them to be read out into a u64 count value. (read() returns 8 on | 106 | them to be read out into a u64 count value. (read() returns 8 on |
@@ -76,6 +111,10 @@ the IP of the interrupted context. In this case read() will return | |||
76 | the 8-byte counter value, plus the Instruction Pointer address of the | 111 | the 8-byte counter value, plus the Instruction Pointer address of the |
77 | interrupted context. | 112 | interrupted context. |
78 | 113 | ||
114 | The 'irq_period' field of hw_event (formerly the 'hw_event_period' parameter) is the number of events before waking up | ||
115 | a read() that is blocked on a counter fd. Zero value means a non-blocking | ||
116 | counter. | ||
117 | |||
79 | The 'pid' parameter allows the counter to be specific to a task: | 118 | The 'pid' parameter allows the counter to be specific to a task: |
80 | 119 | ||
81 | pid == 0: if the pid parameter is zero, the counter is attached to the | 120 | pid == 0: if the pid parameter is zero, the counter is attached to the |
@@ -92,7 +131,7 @@ CPU: | |||
92 | cpu >= 0: the counter is restricted to a specific CPU | 131 | cpu >= 0: the counter is restricted to a specific CPU |
93 | cpu == -1: the counter counts on all CPUs | 132 | cpu == -1: the counter counts on all CPUs |
94 | 133 | ||
95 | Note: the combination of 'pid == -1' and 'cpu == -1' is not valid. | 134 | (Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.) |
96 | 135 | ||
97 | A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts | 136 | A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts |
98 | events of that task and 'follows' that task to whatever CPU the task | 137 | events of that task and 'follows' that task to whatever CPU the task |
@@ -102,3 +141,7 @@ their own tasks. | |||
102 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts | 141 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts |
103 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. | 142 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. |
104 | 143 | ||
144 | Group counters are created by passing in a group_fd of another counter. | ||
145 | Groups are scheduled at once and can be used with PERF_RECORD_GROUP | ||
146 | to record multi-dimensional timestamps. | ||
147 | |||