diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2009-05-01 06:23:19 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-05-01 07:23:45 -0400 |
commit | e5791a808ae91a9e7e1b65ea9b8de0f96a043d88 (patch) | |
tree | abb4a533f8025b8b9bcd2b9b1980916c5aaf3549 /Documentation/perf_counter/design.txt | |
parent | 585e3374d9d29376c2c37d821c8b7637dd48ca95 (diff) |
perf_counter: documentation update
Update the documentation to reflect the current state of affairs
[ Impact: documentation update ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.296727903@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'Documentation/perf_counter/design.txt')
-rw-r--r-- | Documentation/perf_counter/design.txt | 274 |
1 files changed, 220 insertions, 54 deletions
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt index aaf105c02fba..9930c4bddc6f 100644 --- a/Documentation/perf_counter/design.txt +++ b/Documentation/perf_counter/design.txt | |||
@@ -34,41 +34,47 @@ can be poll()ed. | |||
34 | 34 | ||
35 | When creating a new counter fd, 'perf_counter_hw_event' is: | 35 | When creating a new counter fd, 'perf_counter_hw_event' is: |
36 | 36 | ||
37 | /* | ||
38 | * Event to monitor via a performance monitoring counter: | ||
39 | */ | ||
40 | struct perf_counter_hw_event { | 37 | struct perf_counter_hw_event { |
41 | __u64 event_config; | 38 | /* |
42 | 39 | * The MSB of the config word signifies if the rest contains cpu | |
43 | __u64 irq_period; | 40 | * specific (raw) counter configuration data, if unset, the next |
44 | __u64 record_type; | 41 | * 7 bits are an event type and the rest of the bits are the event |
45 | __u64 read_format; | 42 | * identifier. |
46 | 43 | */ | |
47 | __u64 disabled : 1, /* off by default */ | 44 | __u64 config; |
48 | nmi : 1, /* NMI sampling */ | 45 | |
49 | inherit : 1, /* children inherit it */ | 46 | __u64 irq_period; |
50 | pinned : 1, /* must always be on PMU */ | 47 | __u32 record_type; |
51 | exclusive : 1, /* only group on PMU */ | 48 | __u32 read_format; |
52 | exclude_user : 1, /* don't count user */ | 49 | |
53 | exclude_kernel : 1, /* ditto kernel */ | 50 | __u64 disabled : 1, /* off by default */ |
54 | exclude_hv : 1, /* ditto hypervisor */ | 51 | nmi : 1, /* NMI sampling */ |
55 | exclude_idle : 1, /* don't count when idle */ | 52 | inherit : 1, /* children inherit it */ |
56 | 53 | pinned : 1, /* must always be on PMU */ | |
57 | __reserved_1 : 55; | 54 | exclusive : 1, /* only group on PMU */ |
58 | 55 | exclude_user : 1, /* don't count user */ | |
59 | __u32 extra_config_len; | 56 | exclude_kernel : 1, /* ditto kernel */ |
60 | 57 | exclude_hv : 1, /* ditto hypervisor */ | |
61 | __u32 __reserved_4; | 58 | exclude_idle : 1, /* don't count when idle */ |
62 | __u64 __reserved_2; | 59 | mmap : 1, /* include mmap data */ |
63 | __u64 __reserved_3; | 60 | munmap : 1, /* include munmap data */ |
61 | comm : 1, /* include comm data */ | ||
62 | |||
63 | __reserved_1 : 52; | ||
64 | |||
65 | __u32 extra_config_len; | ||
66 | __u32 wakeup_events; /* wakeup every n events */ | ||
67 | |||
68 | __u64 __reserved_2; | ||
69 | __u64 __reserved_3; | ||
64 | }; | 70 | }; |
65 | 71 | ||
66 | The 'event_config' field specifies what the counter should count. It | 72 | The 'config' field specifies what the counter should count. It |
67 | is divided into 3 bit-fields: | 73 | is divided into 3 bit-fields: |
68 | 74 | ||
69 | raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 | 75 | raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 |
70 | type: 7 bits (next most significant) 0x7f00_0000_0000_0000 | 76 | type: 7 bits (next most significant) 0x7f00_0000_0000_0000 |
71 | event_id: 56 bits (least significant) 0x00ff_0000_0000_0000 | 77 | event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff |
72 | 78 | ||
73 | If 'raw_type' is 1, then the counter will count a hardware event | 79 | If 'raw_type' is 1, then the counter will count a hardware event |
74 | specified by the remaining 63 bits of event_config. The encoding is | 80 | specified by the remaining 63 bits of event_config. The encoding is |
@@ -134,41 +140,56 @@ enum sw_event_ids { | |||
134 | PERF_COUNT_PAGE_FAULTS_MAJ = 6, | 140 | PERF_COUNT_PAGE_FAULTS_MAJ = 6, |
135 | }; | 141 | }; |
136 | 142 | ||
143 | Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event | ||
144 | tracer is available, and event_id values can be obtained from | ||
145 | /debug/tracing/events/*/*/id | ||
146 | |||
147 | |||
137 | Counters come in two flavours: counting counters and sampling | 148 | Counters come in two flavours: counting counters and sampling |
138 | counters. A "counting" counter is one that is used for counting the | 149 | counters. A "counting" counter is one that is used for counting the |
139 | number of events that occur, and is characterised by having | 150 | number of events that occur, and is characterised by having |
140 | irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a | 151 | irq_period = 0. |
141 | counting counter simply returns the current value of the counter as | 152 | |
142 | an 8-byte number. | 153 | |
154 | A read() on a counter returns the current value of the counter and, | ||
155 | optionally, additional values as specified by 'read_format'. Each value is a | ||
156 | u64 (8 bytes) in size. | ||
157 | |||
158 | /* | ||
159 | * Bits that can be set in hw_event.read_format to request that | ||
160 | * reads on the counter should return the indicated quantities, | ||
161 | * in increasing order of bit value, after the counter value. | ||
162 | */ | ||
163 | enum perf_counter_read_format { | ||
164 | PERF_FORMAT_TOTAL_TIME_ENABLED = 1, | ||
165 | PERF_FORMAT_TOTAL_TIME_RUNNING = 2, | ||
166 | }; | ||
167 | |||
168 | Using these additional values one can establish the overcommit ratio for a | ||
169 | particular counter allowing one to take the round-robin scheduling effect | ||
170 | into account. | ||
171 | |||
143 | 172 | ||
144 | A "sampling" counter is one that is set up to generate an interrupt | 173 | A "sampling" counter is one that is set up to generate an interrupt |
145 | every N events, where N is given by 'irq_period'. A sampling counter | 174 | every N events, where N is given by 'irq_period'. A sampling counter |
146 | has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The | 175 | has irq_period > 0. The record_type controls what data is recorded on each |
147 | record_type controls what data is recorded on each interrupt, and the | 176 | interrupt: |
148 | available values are currently: | ||
149 | 177 | ||
150 | /* | 178 | /* |
151 | * IRQ-notification data record type: | 179 | * Bits that can be set in hw_event.record_type to request information |
180 | * in the overflow packets. | ||
152 | */ | 181 | */ |
153 | enum perf_counter_record_type { | 182 | enum perf_counter_record_format { |
154 | PERF_RECORD_SIMPLE = 0, | 183 | PERF_RECORD_IP = 1U << 0, |
155 | PERF_RECORD_IRQ = 1, | 184 | PERF_RECORD_TID = 1U << 1, |
156 | PERF_RECORD_GROUP = 2, | 185 | PERF_RECORD_TIME = 1U << 2, |
186 | PERF_RECORD_ADDR = 1U << 3, | ||
187 | PERF_RECORD_GROUP = 1U << 4, | ||
188 | PERF_RECORD_CALLCHAIN = 1U << 5, | ||
157 | }; | 189 | }; |
158 | 190 | ||
159 | A record_type value of PERF_RECORD_IRQ will record the instruction | 191 | Such (and other) events will be recorded in a ring-buffer, which is |
160 | pointer (IP) at which the interrupt occurred. A record_type value of | 192 | available to user-space using mmap() (see below). |
161 | PERF_RECORD_GROUP will record the event_config and counter value of | ||
162 | all of the other counters in the group, and should only be used on a | ||
163 | group leader (see below). Currently these two values are mutually | ||
164 | exclusive, but record_type will become a bit-mask in future and | ||
165 | support other values. | ||
166 | |||
167 | A sampling counter has an event queue, into which an event is placed | ||
168 | on each interrupt. A read() on a sampling counter will read the next | ||
169 | event from the event queue. If the queue is empty, the read() will | ||
170 | either block or return an EAGAIN error, depending on whether the fd | ||
171 | has been set to non-blocking mode or not. | ||
172 | 193 | ||
173 | The 'disabled' bit specifies whether the counter starts out disabled | 194 | The 'disabled' bit specifies whether the counter starts out disabled |
174 | or enabled. If it is initially disabled, it can be enabled by ioctl | 195 | or enabled. If it is initially disabled, it can be enabled by ioctl |
@@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a | |||
206 | way to request that counting of events be restricted to times when the | 227 | way to request that counting of events be restricted to times when the |
207 | CPU is in user, kernel and/or hypervisor mode. | 228 | CPU is in user, kernel and/or hypervisor mode. |
208 | 229 | ||
230 | The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap | ||
231 | operations. These can be used to relate userspace IP addresses to actual | ||
232 | code, even after the mapping (or even the whole process) is gone. | ||
233 | These events are recorded in the ring-buffer (see below). | ||
234 | |||
235 | The 'comm' bit allows tracking of process comm data on process creation. | ||
236 | This too is recorded in the ring-buffer (see below). | ||
209 | 237 | ||
210 | The 'pid' parameter to the perf_counter_open() system call allows the | 238 | The 'pid' parameter to the perf_counter_open() system call allows the |
211 | counter to be specific to a task: | 239 | counter to be specific to a task: |
@@ -250,6 +278,138 @@ can be meaningfully compared, added, divided (to get ratios), etc., | |||
250 | with each other, since they have counted events for the same set of | 278 | with each other, since they have counted events for the same set of |
251 | executed instructions. | 279 | executed instructions. |
252 | 280 | ||
281 | |||
282 | As stated, asynchronous events, such as counter overflow or PROT_EXEC mmap | ||
283 | tracking, are logged into a ring-buffer. This ring-buffer is created and | ||
284 | accessed through mmap(). | ||
285 | |||
286 | The mmap size should be 1+2^n pages, where the first page is a meta-data page | ||
287 | (struct perf_counter_mmap_page) that contains various bits of information such | ||
288 | as where the ring-buffer head is. | ||
289 | |||
290 | /* | ||
291 | * Structure of the page that can be mapped via mmap | ||
292 | */ | ||
293 | struct perf_counter_mmap_page { | ||
294 | __u32 version; /* version number of this structure */ | ||
295 | __u32 compat_version; /* lowest version this is compat with */ | ||
296 | |||
297 | /* | ||
298 | * Bits needed to read the hw counters in user-space. | ||
299 | * | ||
300 | * u32 seq; | ||
301 | * s64 count; | ||
302 | * | ||
303 | * do { | ||
304 | * seq = pc->lock; | ||
305 | * | ||
306 | * barrier(); | ||
307 | * if (pc->index) { | ||
308 | * count = pmc_read(pc->index - 1); | ||
309 | * count += pc->offset; | ||
310 | * } else | ||
311 | * goto regular_read; | ||
312 | * | ||
313 | * barrier(); | ||
314 | * } while (pc->lock != seq); | ||
315 | * | ||
316 | * NOTE: for obvious reasons, this only works on self-monitoring | ||
317 | * processes. | ||
318 | */ | ||
319 | __u32 lock; /* seqlock for synchronization */ | ||
320 | __u32 index; /* hardware counter identifier */ | ||
321 | __s64 offset; /* add to hardware counter value */ | ||
322 | |||
323 | /* | ||
324 | * Control data for the mmap() data buffer. | ||
325 | * | ||
326 | * User-space reading this value should issue an rmb(), on SMP capable | ||
327 | * platforms, after reading this value -- see perf_counter_wakeup(). | ||
328 | */ | ||
329 | __u32 data_head; /* head in the data section */ | ||
330 | }; | ||
331 | |||
332 | NOTE: the hw-counter userspace bits are arch specific and are currently only | ||
333 | implemented on powerpc. | ||
334 | |||
335 | The following 2^n pages are the ring-buffer which contains events of the form: | ||
336 | |||
337 | #define PERF_EVENT_MISC_KERNEL (1 << 0) | ||
338 | #define PERF_EVENT_MISC_USER (1 << 1) | ||
339 | #define PERF_EVENT_MISC_OVERFLOW (1 << 2) | ||
340 | |||
341 | struct perf_event_header { | ||
342 | __u32 type; | ||
343 | __u16 misc; | ||
344 | __u16 size; | ||
345 | }; | ||
346 | |||
347 | enum perf_event_type { | ||
348 | |||
349 | /* | ||
350 | * The MMAP events record the PROT_EXEC mappings so that we can | ||
351 | * correlate userspace IPs to code. They have the following structure: | ||
352 | * | ||
353 | * struct { | ||
354 | * struct perf_event_header header; | ||
355 | * | ||
356 | * u32 pid, tid; | ||
357 | * u64 addr; | ||
358 | * u64 len; | ||
359 | * u64 pgoff; | ||
360 | * char filename[]; | ||
361 | * }; | ||
362 | */ | ||
363 | PERF_EVENT_MMAP = 1, | ||
364 | PERF_EVENT_MUNMAP = 2, | ||
365 | |||
366 | /* | ||
367 | * struct { | ||
368 | * struct perf_event_header header; | ||
369 | * | ||
370 | * u32 pid, tid; | ||
371 | * char comm[]; | ||
372 | * }; | ||
373 | */ | ||
374 | PERF_EVENT_COMM = 3, | ||
375 | |||
376 | /* | ||
377 | * When header.misc & PERF_EVENT_MISC_OVERFLOW is set, header.type | ||
378 | * will be a combination of the PERF_RECORD_* bits: | ||
379 | * | ||
380 | * struct { | ||
381 | * struct perf_event_header header; | ||
382 | * | ||
383 | * { u64 ip; } && PERF_RECORD_IP | ||
384 | * { u32 pid, tid; } && PERF_RECORD_TID | ||
385 | * { u64 time; } && PERF_RECORD_TIME | ||
386 | * { u64 addr; } && PERF_RECORD_ADDR | ||
387 | * | ||
388 | * { u64 nr; | ||
389 | * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP | ||
390 | * | ||
391 | * { u16 nr, | ||
392 | * hv, | ||
393 | * kernel, | ||
394 | * user; | ||
395 | * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN | ||
396 | * }; | ||
397 | */ | ||
398 | }; | ||
399 | |||
400 | NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented | ||
401 | on x86. | ||
402 | |||
403 | Notification of new events is possible through poll()/select()/epoll() and | ||
404 | fcntl() managing signals. | ||
405 | |||
406 | Normally a notification is generated for every page filled; however, one can | ||
407 | additionally set perf_counter_hw_event.wakeup_events to generate one every | ||
408 | 'wakeup_events' counter overflow events. | ||
409 | |||
410 | Future work will include a splice() interface to the ring-buffer. | ||
411 | |||
412 | |||
253 | Counters can be enabled and disabled in two ways: via ioctl and via | 413 | Counters can be enabled and disabled in two ways: via ioctl and via |
254 | prctl. When a counter is disabled, it doesn't count or generate | 414 | prctl. When a counter is disabled, it doesn't count or generate |
255 | events but does continue to exist and maintain its count value. | 415 | events but does continue to exist and maintain its count value. |
@@ -269,6 +429,12 @@ group other than the leader only affects that counter - disabling an | |||
269 | non-leader stops that counter from counting but doesn't affect any | 429 | non-leader stops that counter from counting but doesn't affect any |
270 | other counter. | 430 | other counter. |
271 | 431 | ||
432 | Additionally, non-inherited overflow counters can use | ||
433 | |||
434 | ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr); | ||
435 | |||
436 | to enable a counter for 'nr' events, after which it gets disabled again. | ||
437 | |||
272 | A process can enable or disable all the counter groups that are | 438 | A process can enable or disable all the counter groups that are |
273 | attached to it, using prctl: | 439 | attached to it, using prctl: |
274 | 440 | ||