diff options
author | Paul Mackerras <paulus@samba.org> | 2009-03-22 19:29:36 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-04-06 03:30:31 -0400 |
commit | f66c6b2066b44d4ab8e8ac1ee4cae543738fe2ac (patch) | |
tree | b860b3d957905978e641aee4cb36d1f67cf35351 /Documentation/perf_counter/design.txt | |
parent | 0fd112e41cd6f6d4779cbe327c3632d087e31476 (diff) |
perf_counter: update documentation
Impact: documentation fix
This updates the perfcounter documentation to reflect recent changes.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'Documentation/perf_counter/design.txt')
-rw-r--r-- | Documentation/perf_counter/design.txt | 268 |
1 files changed, 202 insertions, 66 deletions
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt index fddd32189a5..aaf105c02fb 100644 --- a/Documentation/perf_counter/design.txt +++ b/Documentation/perf_counter/design.txt | |||
@@ -11,7 +11,9 @@ thus be used to profile the code that runs on that CPU. | |||
11 | 11 | ||
12 | The Linux Performance Counter subsystem provides an abstraction of these | 12 | The Linux Performance Counter subsystem provides an abstraction of these |
13 | hardware capabilities. It provides per task and per CPU counters, counter | 13 | hardware capabilities. It provides per task and per CPU counters, counter |
14 | groups, and it provides event capabilities on top of those. | 14 | groups, and it provides event capabilities on top of those. It |
15 | provides "virtual" 64-bit counters, regardless of the width of the | ||
16 | underlying hardware counters. | ||
15 | 17 | ||
16 | Performance counters are accessed via special file descriptors. | 18 | Performance counters are accessed via special file descriptors. |
17 | There's one file descriptor per virtual counter used. | 19 | There's one file descriptor per virtual counter used. |
@@ -20,7 +22,8 @@ The special file descriptor is opened via the perf_counter_open() | |||
20 | system call: | 22 | system call: |
21 | 23 | ||
22 | int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, | 24 | int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, |
23 | pid_t pid, int cpu, int group_fd); | 25 | pid_t pid, int cpu, int group_fd, |
26 | unsigned long flags); | ||
24 | 27 | ||
25 | The syscall returns the new fd. The fd can be used via the normal | 28 | The syscall returns the new fd. The fd can be used via the normal |
26 | VFS system calls: read() can be used to read the counter, fcntl() | 29 | VFS system calls: read() can be used to read the counter, fcntl() |
@@ -32,90 +35,180 @@ can be poll()ed. | |||
32 | When creating a new counter fd, 'perf_counter_hw_event' is: | 35 | When creating a new counter fd, 'perf_counter_hw_event' is: |
33 | 36 | ||
34 | /* | 37 | /* |
35 | * Hardware event to monitor via a performance monitoring counter: | 38 | * Event to monitor via a performance monitoring counter: |
36 | */ | 39 | */ |
37 | struct perf_counter_hw_event { | 40 | struct perf_counter_hw_event { |
38 | s64 type; | 41 | __u64 event_config; |
39 | 42 | ||
40 | u64 irq_period; | 43 | __u64 irq_period; |
41 | u32 record_type; | 44 | __u64 record_type; |
45 | __u64 read_format; | ||
42 | 46 | ||
43 | u32 disabled : 1, /* off by default */ | 47 | __u64 disabled : 1, /* off by default */ |
44 | nmi : 1, /* NMI sampling */ | 48 | nmi : 1, /* NMI sampling */ |
45 | raw : 1, /* raw event type */ | 49 | inherit : 1, /* children inherit it */ |
46 | __reserved_1 : 29; | 50 | pinned : 1, /* must always be on PMU */ |
51 | exclusive : 1, /* only group on PMU */ | ||
52 | exclude_user : 1, /* don't count user */ | ||
53 | exclude_kernel : 1, /* ditto kernel */ | ||
54 | exclude_hv : 1, /* ditto hypervisor */ | ||
55 | exclude_idle : 1, /* don't count when idle */ | ||
47 | 56 | ||
48 | u64 __reserved_2; | 57 | __reserved_1 : 55; |
58 | |||
59 | __u32 extra_config_len; | ||
60 | |||
61 | __u32 __reserved_4; | ||
62 | __u64 __reserved_2; | ||
63 | __u64 __reserved_3; | ||
49 | }; | 64 | }; |
50 | 65 | ||
66 | The 'event_config' field specifies what the counter should count. It | ||
67 | is divided into 3 bit-fields: | ||
68 | |||
69 | raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 | ||
70 | type: 7 bits (next most significant) 0x7f00_0000_0000_0000 | ||
71 | event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff | ||
72 | |||
73 | If 'raw_type' is 1, then the counter will count a hardware event | ||
74 | specified by the remaining 63 bits of event_config. The encoding is | ||
75 | machine-specific. | ||
76 | |||
77 | If 'raw_type' is 0, then the 'type' field says what kind of counter | ||
78 | this is, with the following encoding: | ||
79 | |||
80 | enum perf_event_types { | ||
81 | PERF_TYPE_HARDWARE = 0, | ||
82 | PERF_TYPE_SOFTWARE = 1, | ||
83 | PERF_TYPE_TRACEPOINT = 2, | ||
84 | }; | ||
85 | |||
86 | A counter of PERF_TYPE_HARDWARE will count the hardware event | ||
87 | specified by 'event_id': | ||
88 | |||
51 | /* | 89 | /* |
52 | * Generalized performance counter event types, used by the hw_event.type | 90 | * Generalized performance counter event types, used by the hw_event.event_id |
53 | * parameter of the sys_perf_counter_open() syscall: | 91 | * parameter of the sys_perf_counter_open() syscall: |
54 | */ | 92 | */ |
55 | enum hw_event_types { | 93 | enum hw_event_ids { |
56 | /* | 94 | /* |
57 | * Common hardware events, generalized by the kernel: | 95 | * Common hardware events, generalized by the kernel: |
58 | */ | 96 | */ |
59 | PERF_COUNT_CYCLES = 0, | 97 | PERF_COUNT_CPU_CYCLES = 0, |
60 | PERF_COUNT_INSTRUCTIONS = 1, | 98 | PERF_COUNT_INSTRUCTIONS = 1, |
61 | PERF_COUNT_CACHE_REFERENCES = 2, | 99 | PERF_COUNT_CACHE_REFERENCES = 2, |
62 | PERF_COUNT_CACHE_MISSES = 3, | 100 | PERF_COUNT_CACHE_MISSES = 3, |
63 | PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | 101 | PERF_COUNT_BRANCH_INSTRUCTIONS = 4, |
64 | PERF_COUNT_BRANCH_MISSES = 5, | 102 | PERF_COUNT_BRANCH_MISSES = 5, |
65 | 103 | PERF_COUNT_BUS_CYCLES = 6, | |
66 | /* | ||
67 | * Special "software" counters provided by the kernel, even if | ||
68 | * the hardware does not support performance counters. These | ||
69 | * counters measure various physical and sw events of the | ||
70 | * kernel (and allow the profiling of them as well): | ||
71 | */ | ||
72 | PERF_COUNT_CPU_CLOCK = -1, | ||
73 | PERF_COUNT_TASK_CLOCK = -2, | ||
74 | /* | ||
75 | * Future software events: | ||
76 | */ | ||
77 | /* PERF_COUNT_PAGE_FAULTS = -3, | ||
78 | PERF_COUNT_CONTEXT_SWITCHES = -4, */ | ||
79 | }; | 104 | }; |
80 | 105 | ||
81 | These are standardized types of events that work uniformly on all CPUs | 106 | These are standardized types of events that work relatively uniformly |
82 | that implements Performance Counters support under Linux. If a CPU is | 107 | on all CPUs that implement Performance Counters support under Linux, |
83 | not able to count branch-misses, then the system call will return | 108 | although there may be variations (e.g., different CPUs might count |
84 | -EINVAL. | 109 | cache references and misses at different levels of the cache hierarchy). |
110 | If a CPU is not able to count the selected event, then the system call | ||
111 | will return -EINVAL. | ||
85 | 112 | ||
86 | More hw_event_types are supported as well, but they are CPU | 113 | More hw_event_ids are supported as well, but they are CPU-specific |
87 | specific and are enumerated via /sys on a per CPU basis. Raw hw event | 114 | and accessed as raw events. For example, to count "External bus |
88 | types can be passed in under hw_event.type if hw_event.raw is 1. | 115 | cycles while bus lock signal asserted" events on Intel Core CPUs, pass |
89 | For example, to count "External bus cycles while bus lock signal asserted" | 116 | in a 0x4064 event_id value and set hw_event.raw_type to 1. |
90 | events on Intel Core CPUs, pass in a 0x4064 event type value and set | ||
91 | hw_event.raw to 1. | ||
92 | 117 | ||
93 | 'record_type' is the type of data that a read() will provide for the | 118 | A counter of type PERF_TYPE_SOFTWARE will count one of the available |
94 | counter, and it can be one of: | 119 | software events, selected by 'event_id': |
95 | 120 | ||
96 | /* | 121 | /* |
97 | * IRQ-notification data record type: | 122 | * Special "software" counters provided by the kernel, even if the hardware |
123 | * does not support performance counters. These counters measure various | ||
124 | * physical and sw events of the kernel (and allow the profiling of them as | ||
125 | * well): | ||
98 | */ | 126 | */ |
99 | enum perf_counter_record_type { | 127 | enum sw_event_ids { |
100 | PERF_RECORD_SIMPLE = 0, | 128 | PERF_COUNT_CPU_CLOCK = 0, |
101 | PERF_RECORD_IRQ = 1, | 129 | PERF_COUNT_TASK_CLOCK = 1, |
102 | PERF_RECORD_GROUP = 2, | 130 | PERF_COUNT_PAGE_FAULTS = 2, |
131 | PERF_COUNT_CONTEXT_SWITCHES = 3, | ||
132 | PERF_COUNT_CPU_MIGRATIONS = 4, | ||
133 | PERF_COUNT_PAGE_FAULTS_MIN = 5, | ||
134 | PERF_COUNT_PAGE_FAULTS_MAJ = 6, | ||
103 | }; | 135 | }; |
104 | 136 | ||
105 | a "simple" counter is one that counts hardware events and allows | 137 | Counters come in two flavours: counting counters and sampling |
106 | them to be read out into a u64 count value. (read() returns 8 on | 138 | counters. A "counting" counter is one that is used for counting the |
107 | a successful read of a simple counter.) | 139 | number of events that occur, and is characterised by having |
140 | irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a | ||
141 | counting counter simply returns the current value of the counter as | ||
142 | an 8-byte number. | ||
108 | 143 | ||
109 | An "irq" counter is one that will also provide an IRQ context information: | 144 | A "sampling" counter is one that is set up to generate an interrupt |
110 | the IP of the interrupted context. In this case read() will return | 145 | every N events, where N is given by 'irq_period'. A sampling counter |
111 | the 8-byte counter value, plus the Instruction Pointer address of the | 146 | has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The |
112 | interrupted context. | 147 | record_type controls what data is recorded on each interrupt, and the |
148 | available values are currently: | ||
113 | 149 | ||
114 | The parameter 'hw_event_period' is the number of events before waking up | 150 | /* |
115 | a read() that is blocked on a counter fd. Zero value means a non-blocking | 151 | * IRQ-notification data record type: |
116 | counter. | 152 | */ |
153 | enum perf_counter_record_type { | ||
154 | PERF_RECORD_SIMPLE = 0, | ||
155 | PERF_RECORD_IRQ = 1, | ||
156 | PERF_RECORD_GROUP = 2, | ||
157 | }; | ||
117 | 158 | ||
118 | The 'pid' parameter allows the counter to be specific to a task: | 159 | A record_type value of PERF_RECORD_IRQ will record the instruction |
160 | pointer (IP) at which the interrupt occurred. A record_type value of | ||
161 | PERF_RECORD_GROUP will record the event_config and counter value of | ||
162 | all of the other counters in the group, and should only be used on a | ||
163 | group leader (see below). Currently these two values are mutually | ||
164 | exclusive, but record_type will become a bit-mask in future and | ||
165 | support other values. | ||
166 | |||
167 | A sampling counter has an event queue, into which an event is placed | ||
168 | on each interrupt. A read() on a sampling counter will read the next | ||
169 | event from the event queue. If the queue is empty, the read() will | ||
170 | either block or return an EAGAIN error, depending on whether the fd | ||
171 | has been set to non-blocking mode or not. | ||
172 | |||
173 | The 'disabled' bit specifies whether the counter starts out disabled | ||
174 | or enabled. If it is initially disabled, it can be enabled by ioctl | ||
175 | or prctl (see below). | ||
176 | |||
177 | The 'nmi' bit specifies, for hardware events, whether the counter | ||
178 | should be set up to request non-maskable interrupts (NMIs) or normal | ||
179 | interrupts. This bit is ignored if the user doesn't have | ||
180 | CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't | ||
181 | generate NMIs from hardware counters. | ||
182 | |||
183 | The 'inherit' bit, if set, specifies that this counter should count | ||
184 | events on descendant tasks as well as the task specified. This only | ||
185 | applies to new descendants, not to any existing descendants at the | ||
186 | time the counter is created (nor to any new descendants of existing | ||
187 | descendants). | ||
188 | |||
189 | The 'pinned' bit, if set, specifies that the counter should always be | ||
190 | on the CPU if at all possible. It only applies to hardware counters | ||
191 | and only to group leaders. If a pinned counter cannot be put onto the | ||
192 | CPU (e.g. because there are not enough hardware counters or because of | ||
193 | a conflict with some other event), then the counter goes into an | ||
194 | 'error' state, where reads return end-of-file (i.e. read() returns 0) | ||
195 | until the counter is subsequently enabled or disabled. | ||
196 | |||
197 | The 'exclusive' bit, if set, specifies that when this counter's group | ||
198 | is on the CPU, it should be the only group using the CPU's counters. | ||
199 | In future, this will allow sophisticated monitoring programs to supply | ||
200 | extra configuration information via 'extra_config_len' to exploit | ||
201 | advanced features of the CPU's Performance Monitor Unit (PMU) that are | ||
202 | not otherwise accessible and that might disrupt other hardware | ||
203 | counters. | ||
204 | |||
205 | The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a | ||
206 | way to request that events not be counted while the CPU is in user, | ||
207 | kernel and/or hypervisor mode, respectively. | ||
208 | |||
209 | |||
210 | The 'pid' parameter to the perf_counter_open() system call allows the | ||
211 | counter to be specific to a task: | ||
119 | 212 | ||
120 | pid == 0: if the pid parameter is zero, the counter is attached to the | 213 | pid == 0: if the pid parameter is zero, the counter is attached to the |
121 | current task. | 214 | current task. |
@@ -125,8 +218,7 @@ The 'pid' parameter allows the counter to be specific to a task: | |||
125 | 218 | ||
126 | pid < 0: all tasks are counted (per cpu counters) | 219 | pid < 0: all tasks are counted (per cpu counters) |
127 | 220 | ||
128 | The 'cpu' parameter allows a counter to be made specific to a full | 221 | The 'cpu' parameter allows a counter to be made specific to a CPU: |
129 | CPU: | ||
130 | 222 | ||
131 | cpu >= 0: the counter is restricted to a specific CPU | 223 | cpu >= 0: the counter is restricted to a specific CPU |
132 | cpu == -1: the counter counts on all CPUs | 224 | cpu == -1: the counter counts on all CPUs |
@@ -141,7 +233,51 @@ their own tasks. | |||
141 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts | 233 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts |
142 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. | 234 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. |
143 | 235 | ||
144 | Group counters are created by passing in a group_fd of another counter. | 236 | The 'flags' parameter is currently unused and must be zero. |
145 | Groups are scheduled at once and can be used with PERF_RECORD_GROUP | 237 | |
146 | to record multi-dimensional timestamps. | 238 | The 'group_fd' parameter allows counter "groups" to be set up. A |
239 | counter group has one counter which is the group "leader". The leader | ||
240 | is created first, with group_fd = -1 in the perf_counter_open call | ||
241 | that creates it. The rest of the group members are created | ||
242 | subsequently, with group_fd giving the fd of the group leader. | ||
243 | (A single counter on its own is created with group_fd = -1 and is | ||
244 | considered to be a group with only 1 member.) | ||
245 | |||
246 | A counter group is scheduled onto the CPU as a unit, that is, it will | ||
247 | only be put onto the CPU if all of the counters in the group can be | ||
248 | put onto the CPU. This means that the values of the member counters | ||
249 | can be meaningfully compared, added, divided (to get ratios), etc., | ||
250 | with each other, since they have counted events for the same set of | ||
251 | executed instructions. | ||
252 | |||
253 | Counters can be enabled and disabled in two ways: via ioctl and via | ||
254 | prctl. When a counter is disabled, it doesn't count or generate | ||
255 | events but does continue to exist and maintain its count value. | ||
256 | |||
257 | An individual counter or counter group can be enabled with | ||
258 | |||
259 | ioctl(fd, PERF_COUNTER_IOC_ENABLE); | ||
260 | |||
261 | or disabled with | ||
262 | |||
263 | ioctl(fd, PERF_COUNTER_IOC_DISABLE); | ||
264 | |||
265 | Enabling or disabling the leader of a group enables or disables the | ||
266 | whole group; that is, while the group leader is disabled, none of the | ||
267 | counters in the group will count. Enabling or disabling a member of a | ||
268 | group other than the leader only affects that counter - disabling a | ||
269 | non-leader stops that counter from counting but doesn't affect any | ||
270 | other counter. | ||
271 | |||
272 | A process can enable or disable all the counter groups that are | ||
273 | attached to it, using prctl: | ||
274 | |||
275 | prctl(PR_TASK_PERF_COUNTERS_ENABLE); | ||
276 | |||
277 | prctl(PR_TASK_PERF_COUNTERS_DISABLE); | ||
278 | |||
279 | This applies to all counters on the current process, whether created | ||
280 | by this process or by another, and doesn't affect any counters that | ||
281 | this process has created on other processes. It only enables or | ||
282 | disables the group leaders, not any other members in the groups. | ||
147 | 283 | ||