aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2009-03-22 19:29:36 -0400
committerIngo Molnar <mingo@elte.hu>2009-04-06 03:30:31 -0400
commitf66c6b2066b44d4ab8e8ac1ee4cae543738fe2ac (patch)
treeb860b3d957905978e641aee4cb36d1f67cf35351
parent0fd112e41cd6f6d4779cbe327c3632d087e31476 (diff)
perf_counter: update documentation
Impact: documentation fix This updates the perfcounter documentation to reflect recent changes. Signed-off-by: Paul Mackerras <paulus@samba.org>
-rw-r--r--Documentation/perf_counter/design.txt268
1 files changed, 202 insertions, 66 deletions
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index fddd32189a50..aaf105c02fba 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -11,7 +11,9 @@ thus be used to profile the code that runs on that CPU.
11 11
12The Linux Performance Counter subsystem provides an abstraction of these 12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per task and per CPU counters, counter 13hardware capabilities. It provides per task and per CPU counters, counter
14groups, and it provides event capabilities on top of those. 14groups, and it provides event capabilities on top of those. It
15provides "virtual" 64-bit counters, regardless of the width of the
16underlying hardware counters.
15 17
16Performance counters are accessed via special file descriptors. 18Performance counters are accessed via special file descriptors.
17There's one file descriptor per virtual counter used. 19There's one file descriptor per virtual counter used.
@@ -20,7 +22,8 @@ The special file descriptor is opened via the perf_counter_open()
20system call: 22system call:
21 23
22 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, 24 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
23 pid_t pid, int cpu, int group_fd); 25 pid_t pid, int cpu, int group_fd,
26 unsigned long flags);
24 27
25The syscall returns the new fd. The fd can be used via the normal 28The syscall returns the new fd. The fd can be used via the normal
26VFS system calls: read() can be used to read the counter, fcntl() 29VFS system calls: read() can be used to read the counter, fcntl()
@@ -32,90 +35,180 @@ can be poll()ed.
32When creating a new counter fd, 'perf_counter_hw_event' is: 35When creating a new counter fd, 'perf_counter_hw_event' is:
33 36
34/* 37/*
35 * Hardware event to monitor via a performance monitoring counter: 38 * Event to monitor via a performance monitoring counter:
36 */ 39 */
37struct perf_counter_hw_event { 40struct perf_counter_hw_event {
38 s64 type; 41 __u64 event_config;
39 42
40 u64 irq_period; 43 __u64 irq_period;
41 u32 record_type; 44 __u64 record_type;
45 __u64 read_format;
42 46
43 u32 disabled : 1, /* off by default */ 47 __u64 disabled : 1, /* off by default */
44 nmi : 1, /* NMI sampling */ 48 nmi : 1, /* NMI sampling */
45 raw : 1, /* raw event type */ 49 inherit : 1, /* children inherit it */
46 __reserved_1 : 29; 50 pinned : 1, /* must always be on PMU */
51 exclusive : 1, /* only group on PMU */
52 exclude_user : 1, /* don't count user */
53 exclude_kernel : 1, /* ditto kernel */
54 exclude_hv : 1, /* ditto hypervisor */
55 exclude_idle : 1, /* don't count when idle */
47 56
48 u64 __reserved_2; 57 __reserved_1 : 55;
58
59 __u32 extra_config_len;
60
61 __u32 __reserved_4;
62 __u64 __reserved_2;
63 __u64 __reserved_3;
49}; 64};
50 65
66The 'event_config' field specifies what the counter should count. It
67is divided into 3 bit-fields:
68
69raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
70type: 7 bits (next most significant) 0x7f00_0000_0000_0000
71event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
72
73If 'raw_type' is 1, then the counter will count a hardware event
74specified by the remaining 63 bits of event_config. The encoding is
75machine-specific.
76
77If 'raw_type' is 0, then the 'type' field says what kind of counter
78this is, with the following encoding:
79
80enum perf_event_types {
81 PERF_TYPE_HARDWARE = 0,
82 PERF_TYPE_SOFTWARE = 1,
83 PERF_TYPE_TRACEPOINT = 2,
84};
85
86A counter of PERF_TYPE_HARDWARE will count the hardware event
87specified by 'event_id':
88
51/* 89/*
52 * Generalized performance counter event types, used by the hw_event.type 90 * Generalized performance counter event types, used by the hw_event.event_id
53 * parameter of the sys_perf_counter_open() syscall: 91 * parameter of the sys_perf_counter_open() syscall:
54 */ 92 */
55enum hw_event_types { 93enum hw_event_ids {
56 /* 94 /*
57 * Common hardware events, generalized by the kernel: 95 * Common hardware events, generalized by the kernel:
58 */ 96 */
59 PERF_COUNT_CYCLES = 0, 97 PERF_COUNT_CPU_CYCLES = 0,
60 PERF_COUNT_INSTRUCTIONS = 1, 98 PERF_COUNT_INSTRUCTIONS = 1,
61 PERF_COUNT_CACHE_REFERENCES = 2, 99 PERF_COUNT_CACHE_REFERENCES = 2,
62 PERF_COUNT_CACHE_MISSES = 3, 100 PERF_COUNT_CACHE_MISSES = 3,
63 PERF_COUNT_BRANCH_INSTRUCTIONS = 4, 101 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
64 PERF_COUNT_BRANCH_MISSES = 5, 102 PERF_COUNT_BRANCH_MISSES = 5,
65 103 PERF_COUNT_BUS_CYCLES = 6,
66 /*
67 * Special "software" counters provided by the kernel, even if
68 * the hardware does not support performance counters. These
69 * counters measure various physical and sw events of the
70 * kernel (and allow the profiling of them as well):
71 */
72 PERF_COUNT_CPU_CLOCK = -1,
73 PERF_COUNT_TASK_CLOCK = -2,
74 /*
75 * Future software events:
76 */
77 /* PERF_COUNT_PAGE_FAULTS = -3,
78 PERF_COUNT_CONTEXT_SWITCHES = -4, */
79}; 104};
80 105
81These are standardized types of events that work uniformly on all CPUs 106These are standardized types of events that work relatively uniformly
82that implements Performance Counters support under Linux. If a CPU is 107on all CPUs that implement Performance Counters support under Linux,
83not able to count branch-misses, then the system call will return 108although there may be variations (e.g., different CPUs might count
84-EINVAL. 109cache references and misses at different levels of the cache hierarchy).
110If a CPU is not able to count the selected event, then the system call
111will return -EINVAL.
85 112
86More hw_event_types are supported as well, but they are CPU 113More hw_event_ids are supported as well, but they are CPU-specific
87specific and are enumerated via /sys on a per CPU basis. Raw hw event 114and accessed as raw events. For example, to count "External bus
88types can be passed in under hw_event.type if hw_event.raw is 1. 115cycles while bus lock signal asserted" events on Intel Core CPUs, pass
89For example, to count "External bus cycles while bus lock signal asserted" 116in a 0x4064 event_id value and set hw_event.raw_type to 1.
90events on Intel Core CPUs, pass in a 0x4064 event type value and set
91hw_event.raw to 1.
92 117
93'record_type' is the type of data that a read() will provide for the 118A counter of type PERF_TYPE_SOFTWARE will count one of the available
94counter, and it can be one of: 119software events, selected by 'event_id':
95 120
96/* 121/*
97 * IRQ-notification data record type: 122 * Special "software" counters provided by the kernel, even if the hardware
123 * does not support performance counters. These counters measure various
124 * physical and sw events of the kernel (and allow the profiling of them as
125 * well):
98 */ 126 */
99enum perf_counter_record_type { 127enum sw_event_ids {
100 PERF_RECORD_SIMPLE = 0, 128 PERF_COUNT_CPU_CLOCK = 0,
101 PERF_RECORD_IRQ = 1, 129 PERF_COUNT_TASK_CLOCK = 1,
102 PERF_RECORD_GROUP = 2, 130 PERF_COUNT_PAGE_FAULTS = 2,
131 PERF_COUNT_CONTEXT_SWITCHES = 3,
132 PERF_COUNT_CPU_MIGRATIONS = 4,
133 PERF_COUNT_PAGE_FAULTS_MIN = 5,
134 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
103}; 135};
104 136
105a "simple" counter is one that counts hardware events and allows 137Counters come in two flavours: counting counters and sampling
106them to be read out into a u64 count value. (read() returns 8 on 138counters. A "counting" counter is one that is used for counting the
107a successful read of a simple counter.) 139number of events that occur, and is characterised by having
140irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a
141counting counter simply returns the current value of the counter as
142an 8-byte number.
108 143
109An "irq" counter is one that will also provide an IRQ context information: 144A "sampling" counter is one that is set up to generate an interrupt
110the IP of the interrupted context. In this case read() will return 145every N events, where N is given by 'irq_period'. A sampling counter
111the 8-byte counter value, plus the Instruction Pointer address of the 146has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The
112interrupted context. 147record_type controls what data is recorded on each interrupt, and the
148available values are currently:
113 149
114The parameter 'hw_event_period' is the number of events before waking up 150/*
115a read() that is blocked on a counter fd. Zero value means a non-blocking 151 * IRQ-notification data record type:
116counter. 152 */
153enum perf_counter_record_type {
154 PERF_RECORD_SIMPLE = 0,
155 PERF_RECORD_IRQ = 1,
156 PERF_RECORD_GROUP = 2,
157};
117 158
118The 'pid' parameter allows the counter to be specific to a task: 159A record_type value of PERF_RECORD_IRQ will record the instruction
160pointer (IP) at which the interrupt occurred. A record_type value of
161PERF_RECORD_GROUP will record the event_config and counter value of
162all of the other counters in the group, and should only be used on a
163group leader (see below). Currently these two values are mutually
164exclusive, but record_type will become a bit-mask in future and
165support other values.
166
167A sampling counter has an event queue, into which an event is placed
168on each interrupt. A read() on a sampling counter will read the next
169event from the event queue. If the queue is empty, the read() will
170either block or return an EAGAIN error, depending on whether the fd
171has been set to non-blocking mode or not.
172
173The 'disabled' bit specifies whether the counter starts out disabled
174or enabled. If it is initially disabled, it can be enabled by ioctl
175or prctl (see below).
176
177The 'nmi' bit specifies, for hardware events, whether the counter
178should be set up to request non-maskable interrupts (NMIs) or normal
179interrupts. This bit is ignored if the user doesn't have
180CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
181generate NMIs from hardware counters.
182
183The 'inherit' bit, if set, specifies that this counter should count
184events on descendant tasks as well as the task specified. This only
185applies to new descendants, not to any existing descendants at the
186time the counter is created (nor to any new descendants of existing
187descendants).
188
189The 'pinned' bit, if set, specifies that the counter should always be
190on the CPU if at all possible. It only applies to hardware counters
191and only to group leaders. If a pinned counter cannot be put onto the
192CPU (e.g. because there are not enough hardware counters or because of
193a conflict with some other event), then the counter goes into an
194'error' state, where reads return end-of-file (i.e. read() returns 0)
195until the counter is subsequently enabled or disabled.
196
197The 'exclusive' bit, if set, specifies that when this counter's group
198is on the CPU, it should be the only group using the CPU's counters.
199In future, this will allow sophisticated monitoring programs to supply
200extra configuration information via 'extra_config_len' to exploit
201advanced features of the CPU's Performance Monitor Unit (PMU) that are
202not otherwise accessible and that might disrupt other hardware
203counters.
204
205The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
206way to request that events not be counted while the CPU is in user,
207kernel and/or hypervisor mode, respectively.
208
209
210The 'pid' parameter to the perf_counter_open() system call allows the
211counter to be specific to a task:
119 212
120 pid == 0: if the pid parameter is zero, the counter is attached to the 213 pid == 0: if the pid parameter is zero, the counter is attached to the
121 current task. 214 current task.
@@ -125,8 +218,7 @@ The 'pid' parameter allows the counter to be specific to a task:
125 218
126 pid < 0: all tasks are counted (per cpu counters) 219 pid < 0: all tasks are counted (per cpu counters)
127 220
128The 'cpu' parameter allows a counter to be made specific to a full 221The 'cpu' parameter allows a counter to be made specific to a CPU:
129CPU:
130 222
131 cpu >= 0: the counter is restricted to a specific CPU 223 cpu >= 0: the counter is restricted to a specific CPU
132 cpu == -1: the counter counts on all CPUs 224 cpu == -1: the counter counts on all CPUs
@@ -141,7 +233,51 @@ their own tasks.
141A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts 233A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
142all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. 234all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
143 235
144Group counters are created by passing in a group_fd of another counter. 236The 'flags' parameter is currently unused and must be zero.
145Groups are scheduled at once and can be used with PERF_RECORD_GROUP 237
146to record multi-dimensional timestamps. 238The 'group_fd' parameter allows counter "groups" to be set up. A
239counter group has one counter which is the group "leader". The leader
240is created first, with group_fd = -1 in the perf_counter_open call
241that creates it. The rest of the group members are created
242subsequently, with group_fd giving the fd of the group leader.
243(A single counter on its own is created with group_fd = -1 and is
244considered to be a group with only 1 member.)
245
246A counter group is scheduled onto the CPU as a unit, that is, it will
247only be put onto the CPU if all of the counters in the group can be
248put onto the CPU. This means that the values of the member counters
249can be meaningfully compared, added, divided (to get ratios), etc.,
250with each other, since they have counted events for the same set of
251executed instructions.
252
253Counters can be enabled and disabled in two ways: via ioctl and via
254prctl. When a counter is disabled, it doesn't count or generate
255events but does continue to exist and maintain its count value.
256
257An individual counter or counter group can be enabled with
258
259 ioctl(fd, PERF_COUNTER_IOC_ENABLE);
260
261or disabled with
262
263 ioctl(fd, PERF_COUNTER_IOC_DISABLE);
264
265Enabling or disabling the leader of a group enables or disables the
266whole group; that is, while the group leader is disabled, none of the
267counters in the group will count. Enabling or disabling a member of a
268group other than the leader only affects that counter - disabling a
269non-leader stops that counter from counting but doesn't affect any
270other counter.
271
272A process can enable or disable all the counter groups that are
273attached to it, using prctl:
274
275 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
276
277 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
278
279This applies to all counters on the current process, whether created
280by this process or by another, and doesn't affect any counters that
281this process has created on other processes. It only enables or
282disables the group leaders, not any other members in the groups.
147 283