aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/perf_counter/design.txt
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/perf_counter/design.txt')
-rw-r--r--Documentation/perf_counter/design.txt274
1 files changed, 220 insertions, 54 deletions
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index aaf105c02fba..9930c4bddc6f 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -34,41 +34,47 @@ can be poll()ed.
34 34
35When creating a new counter fd, 'perf_counter_hw_event' is: 35When creating a new counter fd, 'perf_counter_hw_event' is:
36 36
37/*
38 * Event to monitor via a performance monitoring counter:
39 */
40struct perf_counter_hw_event { 37struct perf_counter_hw_event {
41 __u64 event_config; 38 /*
42 39 * The MSB of the config word signifies if the rest contains cpu
43 __u64 irq_period; 40 * specific (raw) counter configuration data, if unset, the next
44 __u64 record_type; 41 * 7 bits are an event type and the rest of the bits are the event
45 __u64 read_format; 42 * identifier.
46 43 */
47 __u64 disabled : 1, /* off by default */ 44 __u64 config;
48 nmi : 1, /* NMI sampling */ 45
49 inherit : 1, /* children inherit it */ 46 __u64 irq_period;
50 pinned : 1, /* must always be on PMU */ 47 __u32 record_type;
51 exclusive : 1, /* only group on PMU */ 48 __u32 read_format;
52 exclude_user : 1, /* don't count user */ 49
53 exclude_kernel : 1, /* ditto kernel */ 50 __u64 disabled : 1, /* off by default */
54 exclude_hv : 1, /* ditto hypervisor */ 51 nmi : 1, /* NMI sampling */
55 exclude_idle : 1, /* don't count when idle */ 52 inherit : 1, /* children inherit it */
56 53 pinned : 1, /* must always be on PMU */
57 __reserved_1 : 55; 54 exclusive : 1, /* only group on PMU */
58 55 exclude_user : 1, /* don't count user */
59 __u32 extra_config_len; 56 exclude_kernel : 1, /* ditto kernel */
60 57 exclude_hv : 1, /* ditto hypervisor */
61 __u32 __reserved_4; 58 exclude_idle : 1, /* don't count when idle */
62 __u64 __reserved_2; 59 mmap : 1, /* include mmap data */
63 __u64 __reserved_3; 60 munmap : 1, /* include munmap data */
61 comm : 1, /* include comm data */
62
63 __reserved_1 : 52;
64
65 __u32 extra_config_len;
66 __u32 wakeup_events; /* wakeup every n events */
67
68 __u64 __reserved_2;
69 __u64 __reserved_3;
64}; 70};
65 71
66The 'event_config' field specifies what the counter should count. It 72The 'config' field specifies what the counter should count. It
67is divided into 3 bit-fields: 73is divided into 3 bit-fields:
68 74
69raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 75raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
70type: 7 bits (next most significant) 0x7f00_0000_0000_0000 76type: 7 bits (next most significant) 0x7f00_0000_0000_0000
71event_id: 56 bits (least significant) 0x00ff_0000_0000_0000 77event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
72 78
73If 'raw_type' is 1, then the counter will count a hardware event 79If 'raw_type' is 1, then the counter will count a hardware event
74specified by the remaining 63 bits of event_config. The encoding is 80specified by the remaining 63 bits of 'config'. The encoding is
@@ -134,41 +140,56 @@ enum sw_event_ids {
134 PERF_COUNT_PAGE_FAULTS_MAJ = 6, 140 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
135}; 141};
136 142
143Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
144tracer is available, and event_id values can be obtained from
145/debug/tracing/events/*/*/id
146
147
137Counters come in two flavours: counting counters and sampling 148Counters come in two flavours: counting counters and sampling
138counters. A "counting" counter is one that is used for counting the 149counters. A "counting" counter is one that is used for counting the
139number of events that occur, and is characterised by having 150number of events that occur, and is characterised by having
140irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a 151irq_period = 0.
141counting counter simply returns the current value of the counter as 152
142an 8-byte number. 153
154A read() on a counter returns the current value of the counter and possibly
155additional values as specified by 'read_format'; each value is a u64 (8 bytes)
156in size.
157
158/*
159 * Bits that can be set in hw_event.read_format to request that
160 * reads on the counter should return the indicated quantities,
161 * in increasing order of bit value, after the counter value.
162 */
163enum perf_counter_read_format {
164 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
165 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
166};
167
168Using these additional values one can establish the overcommit ratio for a
169particular counter allowing one to take the round-robin scheduling effect
170into account.
171
143 172
144A "sampling" counter is one that is set up to generate an interrupt 173A "sampling" counter is one that is set up to generate an interrupt
145every N events, where N is given by 'irq_period'. A sampling counter 174every N events, where N is given by 'irq_period'. A sampling counter
146has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The 175has irq_period > 0. The record_type controls what data is recorded on each
147record_type controls what data is recorded on each interrupt, and the 176interrupt:
148available values are currently:
149 177
150/* 178/*
151 * IRQ-notification data record type: 179 * Bits that can be set in hw_event.record_type to request information
180 * in the overflow packets.
152 */ 181 */
153enum perf_counter_record_type { 182enum perf_counter_record_format {
154 PERF_RECORD_SIMPLE = 0, 183 PERF_RECORD_IP = 1U << 0,
155 PERF_RECORD_IRQ = 1, 184 PERF_RECORD_TID = 1U << 1,
156 PERF_RECORD_GROUP = 2, 185 PERF_RECORD_TIME = 1U << 2,
186 PERF_RECORD_ADDR = 1U << 3,
187 PERF_RECORD_GROUP = 1U << 4,
188 PERF_RECORD_CALLCHAIN = 1U << 5,
157}; 189};
158 190
159A record_type value of PERF_RECORD_IRQ will record the instruction 191Such (and other) events will be recorded in a ring-buffer, which is
160pointer (IP) at which the interrupt occurred. A record_type value of 192available to user-space using mmap() (see below).
161PERF_RECORD_GROUP will record the event_config and counter value of
162all of the other counters in the group, and should only be used on a
163group leader (see below). Currently these two values are mutually
164exclusive, but record_type will become a bit-mask in future and
165support other values.
166
167A sampling counter has an event queue, into which an event is placed
168on each interrupt. A read() on a sampling counter will read the next
169event from the event queue. If the queue is empty, the read() will
170either block or return an EAGAIN error, depending on whether the fd
171has been set to non-blocking mode or not.
172 193
173The 'disabled' bit specifies whether the counter starts out disabled 194The 'disabled' bit specifies whether the counter starts out disabled
174or enabled. If it is initially disabled, it can be enabled by ioctl 195or enabled. If it is initially disabled, it can be enabled by ioctl
@@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
206way to request that counting of events be restricted to times when the 227way to request that counting of events be restricted to times when the
207CPU is in user, kernel and/or hypervisor mode. 228CPU is in user, kernel and/or hypervisor mode.
208 229
230The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
231operations. These can be used to relate userspace IP addresses to actual
232code, even after the mapping (or even the whole process) is gone.
233These events are recorded in the ring-buffer (see below).
234
235The 'comm' bit allows tracking of process comm data on process creation.
236This too is recorded in the ring-buffer (see below).
209 237
210The 'pid' parameter to the perf_counter_open() system call allows the 238The 'pid' parameter to the perf_counter_open() system call allows the
211counter to be specific to a task: 239counter to be specific to a task:
@@ -250,6 +278,138 @@ can be meaningfully compared, added, divided (to get ratios), etc.,
250with each other, since they have counted events for the same set of 278with each other, since they have counted events for the same set of
251executed instructions. 279executed instructions.
252 280
281
282As stated above, asynchronous events, such as counter overflow or PROT_EXEC mmap
283tracking, are logged into a ring-buffer. This ring-buffer is created and
284accessed through mmap().
285
286The mmap size should be 1+2^n pages, where the first page is a meta-data page
287(struct perf_counter_mmap_page) that contains various bits of information such
288as where the ring-buffer head is.
289
290/*
291 * Structure of the page that can be mapped via mmap
292 */
293struct perf_counter_mmap_page {
294 __u32 version; /* version number of this structure */
295 __u32 compat_version; /* lowest version this is compat with */
296
297 /*
298 * Bits needed to read the hw counters in user-space.
299 *
300 * u32 seq;
301 * s64 count;
302 *
303 * do {
304 * seq = pc->lock;
305 *
306 * barrier();
307 * if (pc->index) {
308 * count = pmc_read(pc->index - 1);
309 * count += pc->offset;
310 * } else
311 * goto regular_read;
312 *
313 * barrier();
314 * } while (pc->lock != seq);
315 *
316 * NOTE: for obvious reasons this only works on self-monitoring
317 * processes.
318 */
319 __u32 lock; /* seqlock for synchronization */
320 __u32 index; /* hardware counter identifier */
321 __s64 offset; /* add to hardware counter value */
322
323 /*
324 * Control data for the mmap() data buffer.
325 *
326 * User-space reading this value should issue an rmb(), on SMP capable
327 * platforms, after reading this value -- see perf_counter_wakeup().
328 */
329 __u32 data_head; /* head in the data section */
330};
331
332NOTE: the hw-counter userspace bits are arch specific and are currently only
333 implemented on powerpc.
334
335The following 2^n pages are the ring-buffer which contains events of the form:
336
337#define PERF_EVENT_MISC_KERNEL (1 << 0)
338#define PERF_EVENT_MISC_USER (1 << 1)
339#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
340
341struct perf_event_header {
342 __u32 type;
343 __u16 misc;
344 __u16 size;
345};
346
347enum perf_event_type {
348
349 /*
350 * The MMAP events record the PROT_EXEC mappings so that we can
351 * correlate userspace IPs to code. They have the following structure:
352 *
353 * struct {
354 * struct perf_event_header header;
355 *
356 * u32 pid, tid;
357 * u64 addr;
358 * u64 len;
359 * u64 pgoff;
360 * char filename[];
361 * };
362 */
363 PERF_EVENT_MMAP = 1,
364 PERF_EVENT_MUNMAP = 2,
365
366 /*
367 * struct {
368 * struct perf_event_header header;
369 *
370 * u32 pid, tid;
371 * char comm[];
372 * };
373 */
374 PERF_EVENT_COMM = 3,
375
376 /*
377 * When header.misc & PERF_EVENT_MISC_OVERFLOW is set, the header.type
378 * field holds PERF_RECORD_* bits; each member below is present only if its bit is set.
379 *
380 * struct {
381 * struct perf_event_header header;
382 *
383 * { u64 ip; } && PERF_RECORD_IP
384 * { u32 pid, tid; } && PERF_RECORD_TID
385 * { u64 time; } && PERF_RECORD_TIME
386 * { u64 addr; } && PERF_RECORD_ADDR
387 *
388 * { u64 nr;
389 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
390 *
391 * { u16 nr,
392 * hv,
393 * kernel,
394 * user;
395 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
396 * };
397 */
398};
399
400NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
401 on x86.
402
403Notification of new events is possible through poll()/select()/epoll() and
404fcntl() managing signals.
405
406Normally a notification is generated for every page filled; however, one can
407additionally set perf_counter_hw_event.wakeup_events to generate one every
408so many counter overflow events.
409
410Future work will include a splice() interface to the ring-buffer.
411
412
253Counters can be enabled and disabled in two ways: via ioctl and via 413Counters can be enabled and disabled in two ways: via ioctl and via
254prctl. When a counter is disabled, it doesn't count or generate 414prctl. When a counter is disabled, it doesn't count or generate
255events but does continue to exist and maintain its count value. 415events but does continue to exist and maintain its count value.
@@ -269,6 +429,12 @@ group other than the leader only affects that counter - disabling an
269non-leader stops that counter from counting but doesn't affect any 429non-leader stops that counter from counting but doesn't affect any
270other counter. 430other counter.
271 431
432Additionally, non-inherited overflow counters can use
433
434 ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
435
436to enable a counter for 'nr' events, after which it gets disabled again.
437
272A process can enable or disable all the counter groups that are 438A process can enable or disable all the counter groups that are
273attached to it, using prctl: 439attached to it, using prctl:
274 440