diff options
Diffstat (limited to 'tools/perf/design.txt')
-rw-r--r-- | tools/perf/design.txt | 442 |
1 files changed, 442 insertions, 0 deletions
diff --git a/tools/perf/design.txt b/tools/perf/design.txt new file mode 100644 index 000000000000..d3250763dc92 --- /dev/null +++ b/tools/perf/design.txt | |||
@@ -0,0 +1,442 @@ | |||
1 | |||
2 | Performance Counters for Linux | ||
3 | ------------------------------ | ||
4 | |||
5 | Performance counters are special hardware registers available on most modern | ||
6 | CPUs. These registers count the number of certain types of hw events: such | ||
7 | as instructions executed, cache misses suffered, or branches mis-predicted - | ||
8 | without slowing down the kernel or applications. These registers can also | ||
9 | trigger interrupts when a threshold number of events have passed - and can | ||
10 | thus be used to profile the code that runs on that CPU. | ||
11 | |||
12 | The Linux Performance Counter subsystem provides an abstraction of these | ||
13 | hardware capabilities. It provides per task and per CPU counters, counter | ||
14 | groups, and it provides event capabilities on top of those. It | ||
15 | provides "virtual" 64-bit counters, regardless of the width of the | ||
16 | underlying hardware counters. | ||
17 | |||
18 | Performance counters are accessed via special file descriptors. | ||
19 | There's one file descriptor per virtual counter used. | ||
20 | |||
21 | The special file descriptor is opened via the perf_counter_open() | ||
22 | system call: | ||
23 | |||
24 | int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, | ||
25 | pid_t pid, int cpu, int group_fd, | ||
26 | unsigned long flags); | ||
27 | |||
28 | The syscall returns the new fd. The fd can be used via the normal | ||
29 | VFS system calls: read() can be used to read the counter, fcntl() | ||
30 | can be used to set the blocking mode, etc. | ||
31 | |||
32 | Multiple counters can be kept open at a time, and the counters | ||
33 | can be poll()ed. | ||
34 | |||
35 | When creating a new counter fd, 'perf_counter_hw_event' is: | ||
36 | |||
37 | struct perf_counter_hw_event { | ||
38 | /* | ||
39 | * The MSB of the config word signifies if the rest contains cpu | ||
40 | * specific (raw) counter configuration data, if unset, the next | ||
41 | * 7 bits are an event type and the rest of the bits are the event | ||
42 | * identifier. | ||
43 | */ | ||
44 | __u64 config; | ||
45 | |||
46 | __u64 irq_period; | ||
47 | __u32 record_type; | ||
48 | __u32 read_format; | ||
49 | |||
50 | __u64 disabled : 1, /* off by default */ | ||
51 | inherit : 1, /* children inherit it */ | ||
52 | pinned : 1, /* must always be on PMU */ | ||
53 | exclusive : 1, /* only group on PMU */ | ||
54 | exclude_user : 1, /* don't count user */ | ||
55 | exclude_kernel : 1, /* ditto kernel */ | ||
56 | exclude_hv : 1, /* ditto hypervisor */ | ||
57 | exclude_idle : 1, /* don't count when idle */ | ||
58 | mmap : 1, /* include mmap data */ | ||
59 | munmap : 1, /* include munmap data */ | ||
60 | comm : 1, /* include comm data */ | ||
61 | |||
62 | __reserved_1 : 52; | ||
63 | |||
64 | __u32 extra_config_len; | ||
65 | __u32 wakeup_events; /* wakeup every n events */ | ||
66 | |||
67 | __u64 __reserved_2; | ||
68 | __u64 __reserved_3; | ||
69 | }; | ||
70 | |||
71 | The 'config' field specifies what the counter should count. It | ||
72 | is divided into 3 bit-fields: | ||
73 | |||
74 | raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 | ||
75 | type: 7 bits (next most significant) 0x7f00_0000_0000_0000 | ||
76 | event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff | ||
77 | |||
78 | If 'raw_type' is 1, then the counter will count a hardware event | ||
79 | specified by the remaining 63 bits of the config word. The encoding is | ||
80 | machine-specific. | ||
81 | |||
82 | If 'raw_type' is 0, then the 'type' field says what kind of counter | ||
83 | this is, with the following encoding: | ||
84 | |||
85 | enum perf_event_types { | ||
86 | PERF_TYPE_HARDWARE = 0, | ||
87 | PERF_TYPE_SOFTWARE = 1, | ||
88 | PERF_TYPE_TRACEPOINT = 2, | ||
89 | }; | ||
90 | |||
91 | A counter of PERF_TYPE_HARDWARE will count the hardware event | ||
92 | specified by 'event_id': | ||
93 | |||
94 | /* | ||
95 | * Generalized performance counter event types, used by the hw_event.event_id | ||
96 | * parameter of the sys_perf_counter_open() syscall: | ||
97 | */ | ||
98 | enum hw_event_ids { | ||
99 | /* | ||
100 | * Common hardware events, generalized by the kernel: | ||
101 | */ | ||
102 | PERF_COUNT_CPU_CYCLES = 0, | ||
103 | PERF_COUNT_INSTRUCTIONS = 1, | ||
104 | PERF_COUNT_CACHE_REFERENCES = 2, | ||
105 | PERF_COUNT_CACHE_MISSES = 3, | ||
106 | PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | ||
107 | PERF_COUNT_BRANCH_MISSES = 5, | ||
108 | PERF_COUNT_BUS_CYCLES = 6, | ||
109 | }; | ||
110 | |||
111 | These are standardized types of events that work relatively uniformly | ||
112 | on all CPUs that implement Performance Counters support under Linux, | ||
113 | although there may be variations (e.g., different CPUs might count | ||
114 | cache references and misses at different levels of the cache hierarchy). | ||
115 | If a CPU is not able to count the selected event, then the system call | ||
116 | will return -EINVAL. | ||
117 | |||
118 | More hw_event_types are supported as well, but they are CPU-specific | ||
119 | and accessed as raw events. For example, to count "External bus | ||
120 | cycles while bus lock signal asserted" events on Intel Core CPUs, pass | ||
121 | in a 0x4064 event_id value and set hw_event.raw_type to 1. | ||
122 | |||
123 | A counter of type PERF_TYPE_SOFTWARE will count one of the available | ||
124 | software events, selected by 'event_id': | ||
125 | |||
126 | /* | ||
127 | * Special "software" counters provided by the kernel, even if the hardware | ||
128 | * does not support performance counters. These counters measure various | ||
129 | * physical and sw events of the kernel (and allow the profiling of them as | ||
130 | * well): | ||
131 | */ | ||
132 | enum sw_event_ids { | ||
133 | PERF_COUNT_CPU_CLOCK = 0, | ||
134 | PERF_COUNT_TASK_CLOCK = 1, | ||
135 | PERF_COUNT_PAGE_FAULTS = 2, | ||
136 | PERF_COUNT_CONTEXT_SWITCHES = 3, | ||
137 | PERF_COUNT_CPU_MIGRATIONS = 4, | ||
138 | PERF_COUNT_PAGE_FAULTS_MIN = 5, | ||
139 | PERF_COUNT_PAGE_FAULTS_MAJ = 6, | ||
140 | }; | ||
141 | |||
142 | Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event | ||
143 | tracer is available, and event_id values can be obtained from | ||
144 | /debug/tracing/events/*/*/id | ||
145 | |||
146 | |||
147 | Counters come in two flavours: counting counters and sampling | ||
148 | counters. A "counting" counter is one that is used for counting the | ||
149 | number of events that occur, and is characterised by having | ||
150 | irq_period = 0. | ||
151 | |||
152 | |||
153 | A read() on a counter returns the current value of the counter and possible | ||
154 | additional values as specified by 'read_format', each value is a u64 (8 bytes) | ||
155 | in size. | ||
156 | |||
157 | /* | ||
158 | * Bits that can be set in hw_event.read_format to request that | ||
159 | * reads on the counter should return the indicated quantities, | ||
160 | * in increasing order of bit value, after the counter value. | ||
161 | */ | ||
162 | enum perf_counter_read_format { | ||
163 | PERF_FORMAT_TOTAL_TIME_ENABLED = 1, | ||
164 | PERF_FORMAT_TOTAL_TIME_RUNNING = 2, | ||
165 | }; | ||
166 | |||
167 | Using these additional values one can establish the overcommit ratio for a | ||
168 | particular counter allowing one to take the round-robin scheduling effect | ||
169 | into account. | ||
170 | |||
171 | |||
172 | A "sampling" counter is one that is set up to generate an interrupt | ||
173 | every N events, where N is given by 'irq_period'. A sampling counter | ||
174 | has irq_period > 0. The record_type controls what data is recorded on each | ||
175 | interrupt: | ||
176 | |||
177 | /* | ||
178 | * Bits that can be set in hw_event.record_type to request information | ||
179 | * in the overflow packets. | ||
180 | */ | ||
181 | enum perf_counter_record_format { | ||
182 | PERF_RECORD_IP = 1U << 0, | ||
183 | PERF_RECORD_TID = 1U << 1, | ||
184 | PERF_RECORD_TIME = 1U << 2, | ||
185 | PERF_RECORD_ADDR = 1U << 3, | ||
186 | PERF_RECORD_GROUP = 1U << 4, | ||
187 | PERF_RECORD_CALLCHAIN = 1U << 5, | ||
188 | }; | ||
189 | |||
190 | Such (and other) events will be recorded in a ring-buffer, which is | ||
191 | available to user-space using mmap() (see below). | ||
192 | |||
193 | The 'disabled' bit specifies whether the counter starts out disabled | ||
194 | or enabled. If it is initially disabled, it can be enabled by ioctl | ||
195 | or prctl (see below). | ||
196 | |||
197 | The 'inherit' bit, if set, specifies that this counter should count | ||
198 | events on descendant tasks as well as the task specified. This only | ||
199 | applies to new descendants, not to any existing descendants at the | ||
200 | time the counter is created (nor to any new descendants of existing | ||
201 | descendants). | ||
202 | |||
203 | The 'pinned' bit, if set, specifies that the counter should always be | ||
204 | on the CPU if at all possible. It only applies to hardware counters | ||
205 | and only to group leaders. If a pinned counter cannot be put onto the | ||
206 | CPU (e.g. because there are not enough hardware counters or because of | ||
207 | a conflict with some other event), then the counter goes into an | ||
208 | 'error' state, where reads return end-of-file (i.e. read() returns 0) | ||
209 | until the counter is subsequently enabled or disabled. | ||
210 | |||
211 | The 'exclusive' bit, if set, specifies that when this counter's group | ||
212 | is on the CPU, it should be the only group using the CPU's counters. | ||
213 | In future, this will allow sophisticated monitoring programs to supply | ||
214 | extra configuration information via 'extra_config_len' to exploit | ||
215 | advanced features of the CPU's Performance Monitor Unit (PMU) that are | ||
216 | not otherwise accessible and that might disrupt other hardware | ||
217 | counters. | ||
218 | |||
219 | The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a | ||
220 | way to request that counting of events be restricted to times when the | ||
221 | CPU is in user, kernel and/or hypervisor mode. | ||
222 | |||
223 | The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap | ||
224 | operations, these can be used to relate userspace IP addresses to actual | ||
225 | code, even after the mapping (or even the whole process) is gone, | ||
226 | these events are recorded in the ring-buffer (see below). | ||
227 | |||
228 | The 'comm' bit allows tracking of process comm data on process creation. | ||
229 | This too is recorded in the ring-buffer (see below). | ||
230 | |||
231 | The 'pid' parameter to the perf_counter_open() system call allows the | ||
232 | counter to be specific to a task: | ||
233 | |||
234 | pid == 0: if the pid parameter is zero, the counter is attached to the | ||
235 | current task. | ||
236 | |||
237 | pid > 0: the counter is attached to a specific task (if the current task | ||
238 | has sufficient privilege to do so) | ||
239 | |||
240 | pid < 0: all tasks are counted (per cpu counters) | ||
241 | |||
242 | The 'cpu' parameter allows a counter to be made specific to a CPU: | ||
243 | |||
244 | cpu >= 0: the counter is restricted to a specific CPU | ||
245 | cpu == -1: the counter counts on all CPUs | ||
246 | |||
247 | (Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.) | ||
248 | |||
249 | A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts | ||
250 | events of that task and 'follows' that task to whatever CPU the task | ||
251 | gets scheduled to. Per task counters can be created by any user, for | ||
252 | their own tasks. | ||
253 | |||
254 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts | ||
255 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. | ||
256 | |||
257 | The 'flags' parameter is currently unused and must be zero. | ||
258 | |||
259 | The 'group_fd' parameter allows counter "groups" to be set up. A | ||
260 | counter group has one counter which is the group "leader". The leader | ||
261 | is created first, with group_fd = -1 in the perf_counter_open call | ||
262 | that creates it. The rest of the group members are created | ||
263 | subsequently, with group_fd giving the fd of the group leader. | ||
264 | (A single counter on its own is created with group_fd = -1 and is | ||
265 | considered to be a group with only 1 member.) | ||
266 | |||
267 | A counter group is scheduled onto the CPU as a unit, that is, it will | ||
268 | only be put onto the CPU if all of the counters in the group can be | ||
269 | put onto the CPU. This means that the values of the member counters | ||
270 | can be meaningfully compared, added, divided (to get ratios), etc., | ||
271 | with each other, since they have counted events for the same set of | ||
272 | executed instructions. | ||
273 | |||
274 | |||
275 | As stated above, asynchronous events, such as counter overflow or PROT_EXEC mmap | ||
276 | tracking are logged into a ring-buffer. This ring-buffer is created and | ||
277 | accessed through mmap(). | ||
278 | |||
279 | The mmap size should be 1+2^n pages, where the first page is a meta-data page | ||
280 | (struct perf_counter_mmap_page) that contains various bits of information such | ||
281 | as where the ring-buffer head is. | ||
282 | |||
283 | /* | ||
284 | * Structure of the page that can be mapped via mmap | ||
285 | */ | ||
286 | struct perf_counter_mmap_page { | ||
287 | __u32 version; /* version number of this structure */ | ||
288 | __u32 compat_version; /* lowest version this is compat with */ | ||
289 | |||
290 | /* | ||
291 | * Bits needed to read the hw counters in user-space. | ||
292 | * | ||
293 | * u32 seq; | ||
294 | * s64 count; | ||
295 | * | ||
296 | * do { | ||
297 | * seq = pc->lock; | ||
298 | * | ||
299 | * barrier() | ||
300 | * if (pc->index) { | ||
301 | * count = pmc_read(pc->index - 1); | ||
302 | * count += pc->offset; | ||
303 | * } else | ||
304 | * goto regular_read; | ||
305 | * | ||
306 | * barrier(); | ||
307 | * } while (pc->lock != seq); | ||
308 | * | ||
309 | * NOTE: for obvious reason this only works on self-monitoring | ||
310 | * processes. | ||
311 | */ | ||
312 | __u32 lock; /* seqlock for synchronization */ | ||
313 | __u32 index; /* hardware counter identifier */ | ||
314 | __s64 offset; /* add to hardware counter value */ | ||
315 | |||
316 | /* | ||
317 | * Control data for the mmap() data buffer. | ||
318 | * | ||
319 | * User-space reading this value should issue an rmb(), on SMP capable | ||
320 | * platforms, after reading this value -- see perf_counter_wakeup(). | ||
321 | */ | ||
322 | __u32 data_head; /* head in the data section */ | ||
323 | }; | ||
324 | |||
325 | NOTE: the hw-counter userspace bits are arch specific and are currently only | ||
326 | implemented on powerpc. | ||
327 | |||
328 | The following 2^n pages are the ring-buffer which contains events of the form: | ||
329 | |||
330 | #define PERF_EVENT_MISC_KERNEL (1 << 0) | ||
331 | #define PERF_EVENT_MISC_USER (1 << 1) | ||
332 | #define PERF_EVENT_MISC_OVERFLOW (1 << 2) | ||
333 | |||
334 | struct perf_event_header { | ||
335 | __u32 type; | ||
336 | __u16 misc; | ||
337 | __u16 size; | ||
338 | }; | ||
339 | |||
340 | enum perf_event_type { | ||
341 | |||
342 | /* | ||
343 | * The MMAP events record the PROT_EXEC mappings so that we can | ||
344 | * correlate userspace IPs to code. They have the following structure: | ||
345 | * | ||
346 | * struct { | ||
347 | * struct perf_event_header header; | ||
348 | * | ||
349 | * u32 pid, tid; | ||
350 | * u64 addr; | ||
351 | * u64 len; | ||
352 | * u64 pgoff; | ||
353 | * char filename[]; | ||
354 | * }; | ||
355 | */ | ||
356 | PERF_EVENT_MMAP = 1, | ||
357 | PERF_EVENT_MUNMAP = 2, | ||
358 | |||
359 | /* | ||
360 | * struct { | ||
361 | * struct perf_event_header header; | ||
362 | * | ||
363 | * u32 pid, tid; | ||
364 | * char comm[]; | ||
365 | * }; | ||
366 | */ | ||
367 | PERF_EVENT_COMM = 3, | ||
368 | |||
369 | /* | ||
370 | * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field | ||
371 | * will be PERF_RECORD_* | ||
372 | * | ||
373 | * struct { | ||
374 | * struct perf_event_header header; | ||
375 | * | ||
376 | * { u64 ip; } && PERF_RECORD_IP | ||
377 | * { u32 pid, tid; } && PERF_RECORD_TID | ||
378 | * { u64 time; } && PERF_RECORD_TIME | ||
379 | * { u64 addr; } && PERF_RECORD_ADDR | ||
380 | * | ||
381 | * { u64 nr; | ||
382 | * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP | ||
383 | * | ||
384 | * { u16 nr, | ||
385 | * hv, | ||
386 | * kernel, | ||
387 | * user; | ||
388 | * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN | ||
389 | * }; | ||
390 | */ | ||
391 | }; | ||
392 | |||
393 | NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented | ||
394 | on x86. | ||
395 | |||
396 | Notification of new events is possible through poll()/select()/epoll() and | ||
397 | fcntl() managing signals. | ||
398 | |||
399 | Normally a notification is generated for every page filled, however one can | ||
400 | additionally set perf_counter_hw_event.wakeup_events to generate one every | ||
401 | so many counter overflow events. | ||
402 | |||
403 | Future work will include a splice() interface to the ring-buffer. | ||
404 | |||
405 | |||
406 | Counters can be enabled and disabled in two ways: via ioctl and via | ||
407 | prctl. When a counter is disabled, it doesn't count or generate | ||
408 | events but does continue to exist and maintain its count value. | ||
409 | |||
410 | An individual counter or counter group can be enabled with | ||
411 | |||
412 | ioctl(fd, PERF_COUNTER_IOC_ENABLE); | ||
413 | |||
414 | or disabled with | ||
415 | |||
416 | ioctl(fd, PERF_COUNTER_IOC_DISABLE); | ||
417 | |||
418 | Enabling or disabling the leader of a group enables or disables the | ||
419 | whole group; that is, while the group leader is disabled, none of the | ||
420 | counters in the group will count. Enabling or disabling a member of a | ||
421 | group other than the leader only affects that counter - disabling a | ||
422 | non-leader stops that counter from counting but doesn't affect any | ||
423 | other counter. | ||
424 | |||
425 | Additionally, non-inherited overflow counters can use | ||
426 | |||
427 | ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr); | ||
428 | |||
429 | to enable a counter for 'nr' events, after which it gets disabled again. | ||
430 | |||
431 | A process can enable or disable all the counter groups that are | ||
432 | attached to it, using prctl: | ||
433 | |||
434 | prctl(PR_TASK_PERF_COUNTERS_ENABLE); | ||
435 | |||
436 | prctl(PR_TASK_PERF_COUNTERS_DISABLE); | ||
437 | |||
438 | This applies to all counters on the current process, whether created | ||
439 | by this process or by another, and doesn't affect any counters that | ||
440 | this process has created on other processes. It only enables or | ||
441 | disables the group leaders, not any other members in the groups. | ||
442 | |||