68 files changed, 10460 insertions, 93 deletions
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile new file mode 100644 index 000000000000..194b66215588 --- /dev/null +++ b/Documentation/perf_counter/Makefile | |||
@@ -0,0 +1,12 @@ | |||
1 | BINS = kerneltop perfstat | ||
2 | |||
3 | all: $(BINS) | ||
4 | |||
5 | kerneltop: kerneltop.c ../../include/linux/perf_counter.h | ||
6 | cc -O6 -Wall -o $@ $< -lrt | ||
7 | |||
8 | perfstat: kerneltop | ||
9 | ln -sf kerneltop perfstat | ||
10 | |||
11 | clean: | ||
12 | rm $(BINS) | ||
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt new file mode 100644 index 000000000000..aaf105c02fba --- /dev/null +++ b/Documentation/perf_counter/design.txt | |||
@@ -0,0 +1,283 @@ | |||
1 | |||
2 | Performance Counters for Linux | ||
3 | ------------------------------ | ||
4 | |||
5 | Performance counters are special hardware registers available on most modern | ||
6 | CPUs. These registers count the number of certain types of hardware events, | ||
7 | such as instructions executed, cache misses suffered, or branches mis-predicted - | ||
8 | without slowing down the kernel or applications. These registers can also | ||
9 | trigger interrupts when a threshold number of events have passed - and can | ||
10 | thus be used to profile the code that runs on that CPU. | ||
11 | |||
12 | The Linux Performance Counter subsystem provides an abstraction of these | ||
13 | hardware capabilities. It provides per-task and per-CPU counters, counter | ||
14 | groups, and event capabilities on top of those. It | ||
15 | provides "virtual" 64-bit counters, regardless of the width of the | ||
16 | underlying hardware counters. | ||
17 | |||
18 | Performance counters are accessed via special file descriptors. | ||
19 | There's one file descriptor per virtual counter used. | ||
20 | |||
21 | The special file descriptor is opened via the perf_counter_open() | ||
22 | system call: | ||
23 | |||
24 | int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, | ||
25 | pid_t pid, int cpu, int group_fd, | ||
26 | unsigned long flags); | ||
27 | |||
28 | The syscall returns the new fd. The fd can be used via the normal | ||
29 | VFS system calls: read() can be used to read the counter, fcntl() | ||
30 | can be used to set the blocking mode, etc. | ||
31 | |||
32 | Multiple counters can be kept open at a time, and the counters | ||
33 | can be poll()ed. | ||
34 | |||
35 | When creating a new counter fd, 'perf_counter_hw_event' is: | ||
36 | |||
37 | /* | ||
38 | * Event to monitor via a performance monitoring counter: | ||
39 | */ | ||
40 | struct perf_counter_hw_event { | ||
41 | __u64 event_config; | ||
42 | |||
43 | __u64 irq_period; | ||
44 | __u64 record_type; | ||
45 | __u64 read_format; | ||
46 | |||
47 | __u64 disabled : 1, /* off by default */ | ||
48 | nmi : 1, /* NMI sampling */ | ||
49 | inherit : 1, /* children inherit it */ | ||
50 | pinned : 1, /* must always be on PMU */ | ||
51 | exclusive : 1, /* only group on PMU */ | ||
52 | exclude_user : 1, /* don't count user */ | ||
53 | exclude_kernel : 1, /* ditto kernel */ | ||
54 | exclude_hv : 1, /* ditto hypervisor */ | ||
55 | exclude_idle : 1, /* don't count when idle */ | ||
56 | |||
57 | __reserved_1 : 55; | ||
58 | |||
59 | __u32 extra_config_len; | ||
60 | |||
61 | __u32 __reserved_4; | ||
62 | __u64 __reserved_2; | ||
63 | __u64 __reserved_3; | ||
64 | }; | ||
65 | |||
66 | The 'event_config' field specifies what the counter should count. It | ||
67 | is divided into 3 bit-fields: | ||
68 | |||
69 | raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 | ||
70 | type: 7 bits (next most significant) 0x7f00_0000_0000_0000 | ||
71 | event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff | ||
72 | |||
73 | If 'raw_type' is 1, then the counter will count a hardware event | ||
74 | specified by the remaining 63 bits of event_config. The encoding is | ||
75 | machine-specific. | ||
76 | |||
77 | If 'raw_type' is 0, then the 'type' field says what kind of counter | ||
78 | this is, with the following encoding: | ||
79 | |||
80 | enum perf_event_types { | ||
81 | PERF_TYPE_HARDWARE = 0, | ||
82 | PERF_TYPE_SOFTWARE = 1, | ||
83 | PERF_TYPE_TRACEPOINT = 2, | ||
84 | }; | ||
85 | |||
86 | A counter of PERF_TYPE_HARDWARE will count the hardware event | ||
87 | specified by 'event_id': | ||
88 | |||
89 | /* | ||
90 | * Generalized performance counter event types, used by the hw_event.event_id | ||
91 | * parameter of the sys_perf_counter_open() syscall: | ||
92 | */ | ||
93 | enum hw_event_ids { | ||
94 | /* | ||
95 | * Common hardware events, generalized by the kernel: | ||
96 | */ | ||
97 | PERF_COUNT_CPU_CYCLES = 0, | ||
98 | PERF_COUNT_INSTRUCTIONS = 1, | ||
99 | PERF_COUNT_CACHE_REFERENCES = 2, | ||
100 | PERF_COUNT_CACHE_MISSES = 3, | ||
101 | PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | ||
102 | PERF_COUNT_BRANCH_MISSES = 5, | ||
103 | PERF_COUNT_BUS_CYCLES = 6, | ||
104 | }; | ||
105 | |||
106 | These are standardized types of events that work relatively uniformly | ||
107 | on all CPUs that implement Performance Counters support under Linux, | ||
108 | although there may be variations (e.g., different CPUs might count | ||
109 | cache references and misses at different levels of the cache hierarchy). | ||
110 | If a CPU is not able to count the selected event, then the system call | ||
111 | will return -EINVAL. | ||
112 | |||
113 | More hardware event types are supported as well, but they are CPU-specific | ||
114 | and accessed as raw events. For example, to count "External bus | ||
115 | cycles while bus lock signal asserted" events on Intel Core CPUs, pass | ||
116 | in a 0x4064 event_id value and set the raw_type bit of event_config to 1. | ||
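As a sketch of how these bits combine (the EXAMPLE_* names below are illustrative
only; the kernel header provides the real PERF_COUNTER_*_SHIFT/MASK definitions
used by kerneltop.c further down):

        #define EXAMPLE_RAW_BIT       (1ULL << 63)      /* raw_type bit  */
        #define EXAMPLE_TYPE_SHIFT    56                 /* 7-bit 'type'  */
        #define EXAMPLE_EID(type, id) (((__u64)(type) << EXAMPLE_TYPE_SHIFT) | (id))

        /* generalized event: instructions retired */
        __u64 insns   = EXAMPLE_EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS);

        /* raw, machine-specific event: the 0x4064 Core event above */
        __u64 buslock = EXAMPLE_RAW_BIT | 0x4064;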
117 | |||
118 | A counter of type PERF_TYPE_SOFTWARE will count one of the available | ||
119 | software events, selected by 'event_id': | ||
120 | |||
121 | /* | ||
122 | * Special "software" counters provided by the kernel, even if the hardware | ||
123 | * does not support performance counters. These counters measure various | ||
124 | * physical and sw events of the kernel (and allow the profiling of them as | ||
125 | * well): | ||
126 | */ | ||
127 | enum sw_event_ids { | ||
128 | PERF_COUNT_CPU_CLOCK = 0, | ||
129 | PERF_COUNT_TASK_CLOCK = 1, | ||
130 | PERF_COUNT_PAGE_FAULTS = 2, | ||
131 | PERF_COUNT_CONTEXT_SWITCHES = 3, | ||
132 | PERF_COUNT_CPU_MIGRATIONS = 4, | ||
133 | PERF_COUNT_PAGE_FAULTS_MIN = 5, | ||
134 | PERF_COUNT_PAGE_FAULTS_MAJ = 6, | ||
135 | }; | ||
136 | |||
137 | Counters come in two flavours: counting counters and sampling | ||
138 | counters. A "counting" counter is one that is used for counting the | ||
139 | number of events that occur, and is characterised by having | ||
140 | irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a | ||
141 | counting counter simply returns the current value of the counter as | ||
142 | an 8-byte number. | ||
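As a minimal sketch (assuming the sys_perf_counter_open() wrapper and the EID()
helper from kerneltop.c further down; field names follow the struct shown
earlier), counting instructions in the current task could look like:

        struct perf_counter_hw_event hw_event = {
                .event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
                .irq_period   = 0,                      /* counting, not sampling */
                .record_type  = PERF_RECORD_SIMPLE,
        };
        __u64 count;
        int fd;

        fd = sys_perf_counter_open(&hw_event, 0 /* this task */,
                                   -1 /* any CPU */, -1 /* no group */, 0);
        if (fd < 0)
                perror("perf_counter_open");

        /* ... run the code to be measured ... */

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("%llu instructions\n", (unsigned long long)count);
        close(fd);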
143 | |||
144 | A "sampling" counter is one that is set up to generate an interrupt | ||
145 | every N events, where N is given by 'irq_period'. A sampling counter | ||
146 | has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The | ||
147 | record_type controls what data is recorded on each interrupt, and the | ||
148 | available values are currently: | ||
149 | |||
150 | /* | ||
151 | * IRQ-notification data record type: | ||
152 | */ | ||
153 | enum perf_counter_record_type { | ||
154 | PERF_RECORD_SIMPLE = 0, | ||
155 | PERF_RECORD_IRQ = 1, | ||
156 | PERF_RECORD_GROUP = 2, | ||
157 | }; | ||
158 | |||
159 | A record_type value of PERF_RECORD_IRQ will record the instruction | ||
160 | pointer (IP) at which the interrupt occurred. A record_type value of | ||
161 | PERF_RECORD_GROUP will record the event_config and counter value of | ||
162 | all of the other counters in the group, and should only be used on a | ||
163 | group leader (see below). Currently these two values are mutually | ||
164 | exclusive, but record_type will become a bit-mask in future and | ||
165 | support other values. | ||
166 | |||
167 | A sampling counter has an event queue, into which an event is placed | ||
168 | on each interrupt. A read() on a sampling counter will read the next | ||
169 | event from the event queue. If the queue is empty, the read() will | ||
170 | either block or return an EAGAIN error, depending on whether the fd | ||
171 | has been set to non-blocking mode or not. | ||
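As a rough sketch (the precise record layout is defined by the kernel header;
here each PERF_RECORD_IRQ record is assumed to consist of the sampled
instruction pointer as a __u64), draining a non-blocking sampling fd could look
like:

        __u64 ip;
        ssize_t n;

        for (;;) {
                n = read(fd, &ip, sizeof(ip));  /* one event per read()   */
                if (n == sizeof(ip)) {
                        /* ... process the sampled instruction pointer ... */
                        continue;
                }
                if (n < 0 && errno == EAGAIN)
                        break;                  /* queue empty, try later */
                break;                          /* EOF or error           */
        }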
172 | |||
173 | The 'disabled' bit specifies whether the counter starts out disabled | ||
174 | or enabled. If it is initially disabled, it can be enabled by ioctl | ||
175 | or prctl (see below). | ||
176 | |||
177 | The 'nmi' bit specifies, for hardware events, whether the counter | ||
178 | should be set up to request non-maskable interrupts (NMIs) or normal | ||
179 | interrupts. This bit is ignored if the user doesn't have | ||
180 | CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't | ||
181 | generate NMIs from hardware counters. | ||
182 | |||
183 | The 'inherit' bit, if set, specifies that this counter should count | ||
184 | events on descendant tasks as well as the task specified. This only | ||
185 | applies to new descendants, not to any existing descendants at the | ||
186 | time the counter is created (nor to any new descendants of existing | ||
187 | descendants). | ||
188 | |||
189 | The 'pinned' bit, if set, specifies that the counter should always be | ||
190 | on the CPU if at all possible. It only applies to hardware counters | ||
191 | and only to group leaders. If a pinned counter cannot be put onto the | ||
192 | CPU (e.g. because there are not enough hardware counters or because of | ||
193 | a conflict with some other event), then the counter goes into an | ||
194 | 'error' state, where reads return end-of-file (i.e. read() returns 0) | ||
195 | until the counter is subsequently enabled or disabled. | ||
196 | |||
197 | The 'exclusive' bit, if set, specifies that when this counter's group | ||
198 | is on the CPU, it should be the only group using the CPU's counters. | ||
199 | In future, this will allow sophisticated monitoring programs to supply | ||
200 | extra configuration information via 'extra_config_len' to exploit | ||
201 | advanced features of the CPU's Performance Monitor Unit (PMU) that are | ||
202 | not otherwise accessible and that might disrupt other hardware | ||
203 | counters. | ||
204 | |||
205 | The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a | ||
206 | way to request that counting of events be restricted to times when the | ||
207 | CPU is in user, kernel and/or hypervisor mode. | ||
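For example, a cycle counter restricted to kernel mode could be configured like
this sketch (EID() as in kerneltop.c below):

        struct perf_counter_hw_event hw_event = {
                .event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
                .exclude_user = 1,      /* don't count user mode       */
                .exclude_hv   = 1,      /* don't count hypervisor mode */
        };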
208 | |||
209 | |||
210 | The 'pid' parameter to the perf_counter_open() system call allows the | ||
211 | counter to be specific to a task: | ||
212 | |||
213 | pid == 0: if the pid parameter is zero, the counter is attached to the | ||
214 | current task. | ||
215 | |||
216 | pid > 0: the counter is attached to a specific task (if the current task | ||
217 | has sufficient privilege to do so) | ||
218 | |||
219 | pid < 0: all tasks are counted (per cpu counters) | ||
220 | |||
221 | The 'cpu' parameter allows a counter to be made specific to a CPU: | ||
222 | |||
223 | cpu >= 0: the counter is restricted to a specific CPU | ||
224 | cpu == -1: the counter counts on all CPUs | ||
225 | |||
226 | (Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.) | ||
227 | |||
228 | A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts | ||
229 | events of that task and 'follows' that task to whatever CPU the task | ||
230 | gets scheduled to. Per task counters can be created by any user, for | ||
231 | their own tasks. | ||
232 | |||
233 | A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts | ||
234 | all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. | ||
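For instance, a per-CPU counter of all page faults on CPU 0 would be opened
roughly as follows (a sketch; requires CAP_SYS_ADMIN, with the wrapper and EID()
helper from kerneltop.c below):

        struct perf_counter_hw_event hw_event = {
                .event_config = EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
        };

        int fd = sys_perf_counter_open(&hw_event, -1 /* all tasks */,
                                       0 /* CPU 0 */, -1 /* no group */, 0);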
235 | |||
236 | The 'flags' parameter is currently unused and must be zero. | ||
237 | |||
238 | The 'group_fd' parameter allows counter "groups" to be set up. A | ||
239 | counter group has one counter which is the group "leader". The leader | ||
240 | is created first, with group_fd = -1 in the perf_counter_open call | ||
241 | that creates it. The rest of the group members are created | ||
242 | subsequently, with group_fd giving the fd of the group leader. | ||
243 | (A single counter on its own is created with group_fd = -1 and is | ||
244 | considered to be a group with only 1 member.) | ||
245 | |||
246 | A counter group is scheduled onto the CPU as a unit, that is, it will | ||
247 | only be put onto the CPU if all of the counters in the group can be | ||
248 | put onto the CPU. This means that the values of the member counters | ||
249 | can be meaningfully compared, added, divided (to get ratios), etc., | ||
250 | with each other, since they have counted events for the same set of | ||
251 | executed instructions. | ||
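For instance, a cycles+instructions group, whose two counts can then be divided
to get a cycles-per-instruction ratio, could be set up like this sketch:

        struct perf_counter_hw_event cycles = {
                .event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
        };
        struct perf_counter_hw_event insns = {
                .event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
        };
        int leader_fd, member_fd;

        /* the group leader is created with group_fd == -1 ... */
        leader_fd = sys_perf_counter_open(&cycles, 0, -1, -1, 0);

        /* ... and further members pass the leader's fd as group_fd */
        member_fd = sys_perf_counter_open(&insns, 0, -1, leader_fd, 0);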
252 | |||
253 | Counters can be enabled and disabled in two ways: via ioctl and via | ||
254 | prctl. When a counter is disabled, it doesn't count or generate | ||
255 | events but does continue to exist and maintain its count value. | ||
256 | |||
257 | An individual counter or counter group can be enabled with | ||
258 | |||
259 | ioctl(fd, PERF_COUNTER_IOC_ENABLE); | ||
260 | |||
261 | or disabled with | ||
262 | |||
263 | ioctl(fd, PERF_COUNTER_IOC_DISABLE); | ||
264 | |||
265 | Enabling or disabling the leader of a group enables or disables the | ||
266 | whole group; that is, while the group leader is disabled, none of the | ||
267 | counters in the group will count. Enabling or disabling a member of a | ||
268 | group other than the leader only affects that counter - disabling a | ||
269 | non-leader stops that counter from counting but doesn't affect any | ||
270 | other counter. | ||
271 | |||
272 | A process can enable or disable all the counter groups that are | ||
273 | attached to it, using prctl: | ||
274 | |||
275 | prctl(PR_TASK_PERF_COUNTERS_ENABLE); | ||
276 | |||
277 | prctl(PR_TASK_PERF_COUNTERS_DISABLE); | ||
278 | |||
279 | This applies to all counters on the current process, whether created | ||
280 | by this process or by another, and doesn't affect any counters that | ||
281 | this process has created on other processes. It only enables or | ||
282 | disables the group leaders, not any other members in the groups. | ||
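Putting the pieces together, a perfstat-style measurement of a child command
might look like the following sketch (an inherited, initially disabled counter
enabled around the workload via prctl, much as do_perfstat() in kerneltop.c
below does; 'cmd_argv' here is a stand-in for the command to run):

        struct perf_counter_hw_event hw_event = {
                .event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
                .disabled     = 1,      /* start switched off     */
                .inherit      = 1,      /* count children as well */
        };
        __u64 count;
        int fd, status;

        fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

        prctl(PR_TASK_PERF_COUNTERS_ENABLE);
        if (fork() == 0) {
                execvp(cmd_argv[0], cmd_argv);
                _exit(127);
        }
        wait(&status);
        prctl(PR_TASK_PERF_COUNTERS_DISABLE);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("%llu instructions\n", (unsigned long long)count);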
283 | |||
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c new file mode 100644 index 000000000000..15f3a5f90198 --- /dev/null +++ b/Documentation/perf_counter/kerneltop.c | |||
@@ -0,0 +1,1409 @@ | |||
1 | /* | ||
2 | * kerneltop.c: show top kernel functions - performance counters showcase | ||
3 | |||
4 | Build with: | ||
5 | |||
6 | cc -O6 -Wall -o kerneltop kerneltop.c -lrt | ||
7 | |||
8 | Sample output: | ||
9 | |||
10 | ------------------------------------------------------------------------------ | ||
11 | KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2) | ||
12 | ------------------------------------------------------------------------------ | ||
13 | |||
14 | weight RIP kernel function | ||
15 | ______ ________________ _______________ | ||
16 | |||
17 | 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev | ||
18 | 33.00 - ffffffff804cb740 : sock_alloc_send_skb | ||
19 | 31.26 - ffffffff804ce808 : skb_push | ||
20 | 22.43 - ffffffff80510004 : tcp_established_options | ||
21 | 19.00 - ffffffff8027d250 : find_get_page | ||
22 | 15.76 - ffffffff804e4fc9 : eth_type_trans | ||
23 | 15.20 - ffffffff804d8baa : dst_release | ||
24 | 14.86 - ffffffff804cf5d8 : skb_release_head_state | ||
25 | 14.00 - ffffffff802217d5 : read_hpet | ||
26 | 12.00 - ffffffff804ffb7f : __ip_local_out | ||
27 | 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish | ||
28 | 8.54 - ffffffff805001a3 : ip_queue_xmit | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * perfstat: /usr/bin/time -alike performance counter statistics utility | ||
33 | |||
34 | It summarizes the counter events of all tasks (and child tasks), | ||
35 | covering all CPUs that the command (or workload) executes on. | ||
36 | It only counts the per-task events of the workload started, | ||
37 | independent of how many other tasks run on those CPUs. | ||
38 | |||
39 | Sample output: | ||
40 | |||
41 | $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null | ||
42 | |||
43 | Performance counter stats for 'ls': | ||
44 | |||
45 | 163516953 instructions | ||
46 | 2295 cache-misses | ||
47 | 2855182 branch-misses | ||
48 | */ | ||
49 | |||
50 | /* | ||
51 | * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> | ||
52 | * | ||
53 | * Improvements and fixes by: | ||
54 | * | ||
55 | * Arjan van de Ven <arjan@linux.intel.com> | ||
56 | * Yanmin Zhang <yanmin.zhang@intel.com> | ||
57 | * Wu Fengguang <fengguang.wu@intel.com> | ||
58 | * Mike Galbraith <efault@gmx.de> | ||
59 | * Paul Mackerras <paulus@samba.org> | ||
60 | * | ||
61 | * Released under the GPL v2. (and only v2, not any later version) | ||
62 | */ | ||
63 | |||
64 | #define _GNU_SOURCE | ||
65 | #include <sys/types.h> | ||
66 | #include <sys/stat.h> | ||
67 | #include <sys/time.h> | ||
68 | #include <unistd.h> | ||
69 | #include <stdint.h> | ||
70 | #include <stdlib.h> | ||
71 | #include <string.h> | ||
72 | #include <limits.h> | ||
73 | #include <getopt.h> | ||
74 | #include <assert.h> | ||
75 | #include <fcntl.h> | ||
76 | #include <stdio.h> | ||
77 | #include <errno.h> | ||
78 | #include <ctype.h> | ||
79 | #include <time.h> | ||
80 | #include <sched.h> | ||
81 | #include <pthread.h> | ||
82 | |||
83 | #include <sys/syscall.h> | ||
84 | #include <sys/ioctl.h> | ||
85 | #include <sys/poll.h> | ||
86 | #include <sys/prctl.h> | ||
87 | #include <sys/wait.h> | ||
88 | #include <sys/uio.h> | ||
89 | #include <sys/mman.h> | ||
90 | |||
91 | #include <linux/unistd.h> | ||
92 | #include <linux/types.h> | ||
93 | |||
94 | #include "../../include/linux/perf_counter.h" | ||
95 | |||
96 | |||
97 | /* | ||
98 | * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all | ||
99 | * counters in the current task. | ||
100 | */ | ||
101 | #define PR_TASK_PERF_COUNTERS_DISABLE 31 | ||
102 | #define PR_TASK_PERF_COUNTERS_ENABLE 32 | ||
103 | |||
104 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
105 | |||
106 | #define rdclock() \ | ||
107 | ({ \ | ||
108 | struct timespec ts; \ | ||
109 | \ | ||
110 | clock_gettime(CLOCK_MONOTONIC, &ts); \ | ||
111 | ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ | ||
112 | }) | ||
113 | |||
114 | /* | ||
115 | * Pick up some kernel type conventions: | ||
116 | */ | ||
117 | #define __user | ||
118 | #define asmlinkage | ||
119 | |||
120 | #ifdef __x86_64__ | ||
121 | #define __NR_perf_counter_open 295 | ||
122 | #define rmb() asm volatile("lfence" ::: "memory") | ||
123 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); | ||
124 | #endif | ||
125 | |||
126 | #ifdef __i386__ | ||
127 | #define __NR_perf_counter_open 333 | ||
128 | #define rmb() asm volatile("lfence" ::: "memory") | ||
129 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); | ||
130 | #endif | ||
131 | |||
132 | #ifdef __powerpc__ | ||
133 | #define __NR_perf_counter_open 319 | ||
134 | #define rmb() asm volatile ("sync" ::: "memory") | ||
135 | #define cpu_relax() asm volatile ("" ::: "memory"); | ||
136 | #endif | ||
137 | |||
138 | #define unlikely(x) __builtin_expect(!!(x), 0) | ||
139 | #define min(x, y) ({ \ | ||
140 | typeof(x) _min1 = (x); \ | ||
141 | typeof(y) _min2 = (y); \ | ||
142 | (void) (&_min1 == &_min2); \ | ||
143 | _min1 < _min2 ? _min1 : _min2; }) | ||
144 | |||
145 | asmlinkage int sys_perf_counter_open( | ||
146 | struct perf_counter_hw_event *hw_event_uptr __user, | ||
147 | pid_t pid, | ||
148 | int cpu, | ||
149 | int group_fd, | ||
150 | unsigned long flags) | ||
151 | { | ||
152 | return syscall( | ||
153 | __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); | ||
154 | } | ||
155 | |||
156 | #define MAX_COUNTERS 64 | ||
157 | #define MAX_NR_CPUS 256 | ||
158 | |||
159 | #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) | ||
160 | |||
161 | static int run_perfstat = 0; | ||
162 | static int system_wide = 0; | ||
163 | |||
164 | static int nr_counters = 0; | ||
165 | static __u64 event_id[MAX_COUNTERS] = { | ||
166 | EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), | ||
167 | EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), | ||
168 | EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), | ||
169 | EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), | ||
170 | |||
171 | EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), | ||
172 | EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), | ||
173 | EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), | ||
174 | EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), | ||
175 | }; | ||
176 | static int default_interval = 100000; | ||
177 | static int event_count[MAX_COUNTERS]; | ||
178 | static int fd[MAX_NR_CPUS][MAX_COUNTERS]; | ||
179 | |||
180 | static __u64 count_filter = 100; | ||
181 | |||
182 | static int tid = -1; | ||
183 | static int profile_cpu = -1; | ||
184 | static int nr_cpus = 0; | ||
185 | static int nmi = 1; | ||
186 | static unsigned int realtime_prio = 0; | ||
187 | static int group = 0; | ||
188 | static unsigned int page_size; | ||
189 | static unsigned int mmap_pages = 16; | ||
190 | static int use_mmap = 0; | ||
191 | static int use_munmap = 0; | ||
192 | |||
193 | static char *vmlinux; | ||
194 | |||
195 | static char *sym_filter; | ||
196 | static unsigned long filter_start; | ||
197 | static unsigned long filter_end; | ||
198 | |||
199 | static int delay_secs = 2; | ||
200 | static int zero; | ||
201 | static int dump_symtab; | ||
202 | |||
203 | static int scale; | ||
204 | |||
205 | struct source_line { | ||
206 | uint64_t EIP; | ||
207 | unsigned long count; | ||
208 | char *line; | ||
209 | struct source_line *next; | ||
210 | }; | ||
211 | |||
212 | static struct source_line *lines; | ||
213 | static struct source_line **lines_tail; | ||
214 | |||
215 | const unsigned int default_count[] = { | ||
216 | 1000000, | ||
217 | 1000000, | ||
218 | 10000, | ||
219 | 10000, | ||
220 | 1000000, | ||
221 | 10000, | ||
222 | }; | ||
223 | |||
224 | static char *hw_event_names[] = { | ||
225 | "CPU cycles", | ||
226 | "instructions", | ||
227 | "cache references", | ||
228 | "cache misses", | ||
229 | "branches", | ||
230 | "branch misses", | ||
231 | "bus cycles", | ||
232 | }; | ||
233 | |||
234 | static char *sw_event_names[] = { | ||
235 | "cpu clock ticks", | ||
236 | "task clock ticks", | ||
237 | "pagefaults", | ||
238 | "context switches", | ||
239 | "CPU migrations", | ||
240 | "minor faults", | ||
241 | "major faults", | ||
242 | }; | ||
243 | |||
244 | struct event_symbol { | ||
245 | __u64 event; | ||
246 | char *symbol; | ||
247 | }; | ||
248 | |||
249 | static struct event_symbol event_symbols[] = { | ||
250 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, | ||
251 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, | ||
252 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, | ||
253 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, | ||
254 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, | ||
255 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, | ||
256 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, | ||
257 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, | ||
258 | {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, | ||
259 | |||
260 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, | ||
261 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, | ||
262 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, | ||
263 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, | ||
264 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, | ||
265 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, | ||
266 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, | ||
267 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, | ||
268 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, | ||
269 | {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, | ||
270 | }; | ||
271 | |||
272 | #define __PERF_COUNTER_FIELD(config, name) \ | ||
273 | ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) | ||
274 | |||
275 | #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) | ||
276 | #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) | ||
277 | #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) | ||
278 | #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) | ||
279 | |||
280 | static void display_events_help(void) | ||
281 | { | ||
282 | unsigned int i; | ||
283 | __u64 e; | ||
284 | |||
285 | printf( | ||
286 | " -e EVENT --event=EVENT # symbolic-name abbreviations"); | ||
287 | |||
288 | for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { | ||
289 | int type, id; | ||
290 | |||
291 | e = event_symbols[i].event; | ||
292 | type = PERF_COUNTER_TYPE(e); | ||
293 | id = PERF_COUNTER_ID(e); | ||
294 | |||
295 | printf("\n %d:%d: %-20s", | ||
296 | type, id, event_symbols[i].symbol); | ||
297 | } | ||
298 | |||
299 | printf("\n" | ||
300 | " rNNN: raw PMU events (eventsel+umask)\n\n"); | ||
301 | } | ||
302 | |||
303 | static void display_perfstat_help(void) | ||
304 | { | ||
305 | printf( | ||
306 | "Usage: perfstat [<events...>] <cmd...>\n\n" | ||
307 | "PerfStat Options (up to %d event types can be specified):\n\n", | ||
308 | MAX_COUNTERS); | ||
309 | |||
310 | display_events_help(); | ||
311 | |||
312 | printf( | ||
313 | " -l # scale counter values\n" | ||
314 | " -a # system-wide collection\n"); | ||
315 | exit(0); | ||
316 | } | ||
317 | |||
318 | static void display_help(void) | ||
319 | { | ||
320 | if (run_perfstat) | ||
321 | return display_perfstat_help(); | ||
322 | |||
323 | printf( | ||
324 | "Usage: kerneltop [<options>]\n" | ||
325 | " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n" | ||
326 | "KernelTop Options (up to %d event types can be specified at once):\n\n", | ||
327 | MAX_COUNTERS); | ||
328 | |||
329 | display_events_help(); | ||
330 | |||
331 | printf( | ||
332 | " -S --stat # perfstat COMMAND\n" | ||
333 | " -a # system-wide collection (for perfstat)\n\n" | ||
334 | " -c CNT --count=CNT # event period to sample\n\n" | ||
335 | " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" | ||
336 | " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" | ||
337 | " -l # show scale factor for RR events\n" | ||
338 | " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n" | ||
339 | " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" | ||
340 | " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n" | ||
341 | " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n" | ||
342 | " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n" | ||
343 | " -z --zero # zero counts after display\n" | ||
344 | " -D --dump_symtab # dump symbol table to stderr on startup\n" | ||
345 | " -m pages --mmap_pages=<pages> # number of mmap data pages\n" | ||
346 | " -M --mmap_info # print mmap info stream\n" | ||
347 | " -U --munmap_info # print munmap info stream\n" | ||
348 | ); | ||
349 | |||
350 | exit(0); | ||
351 | } | ||
352 | |||
353 | static char *event_name(int ctr) | ||
354 | { | ||
355 | __u64 config = event_id[ctr]; | ||
356 | int type = PERF_COUNTER_TYPE(config); | ||
357 | int id = PERF_COUNTER_ID(config); | ||
358 | static char buf[32]; | ||
359 | |||
360 | if (PERF_COUNTER_RAW(config)) { | ||
361 | sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); | ||
362 | return buf; | ||
363 | } | ||
364 | |||
365 | switch (type) { | ||
366 | case PERF_TYPE_HARDWARE: | ||
367 | if (id < PERF_HW_EVENTS_MAX) | ||
368 | return hw_event_names[id]; | ||
369 | return "unknown-hardware"; | ||
370 | |||
371 | case PERF_TYPE_SOFTWARE: | ||
372 | if (id < PERF_SW_EVENTS_MAX) | ||
373 | return sw_event_names[id]; | ||
374 | return "unknown-software"; | ||
375 | |||
376 | default: | ||
377 | break; | ||
378 | } | ||
379 | |||
380 | return "unknown"; | ||
381 | } | ||
382 | |||
383 | /* | ||
384 | * Each event can have multiple symbolic names. | ||
385 | * Symbolic names are (almost) exactly matched. | ||
386 | */ | ||
387 | static __u64 match_event_symbols(char *str) | ||
388 | { | ||
389 | __u64 config, id; | ||
390 | int type; | ||
391 | unsigned int i; | ||
392 | |||
393 | if (sscanf(str, "r%llx", &config) == 1) | ||
394 | return config | PERF_COUNTER_RAW_MASK; | ||
395 | |||
396 | if (sscanf(str, "%d:%llu", &type, &id) == 2) | ||
397 | return EID(type, id); | ||
398 | |||
399 | for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { | ||
400 | if (!strncmp(str, event_symbols[i].symbol, | ||
401 | strlen(event_symbols[i].symbol))) | ||
402 | return event_symbols[i].event; | ||
403 | } | ||
404 | |||
405 | return ~0ULL; | ||
406 | } | ||
407 | |||
408 | static int parse_events(char *str) | ||
409 | { | ||
410 | __u64 config; | ||
411 | |||
412 | again: | ||
413 | if (nr_counters == MAX_COUNTERS) | ||
414 | return -1; | ||
415 | |||
416 | config = match_event_symbols(str); | ||
417 | if (config == ~0ULL) | ||
418 | return -1; | ||
419 | |||
420 | event_id[nr_counters] = config; | ||
421 | nr_counters++; | ||
422 | |||
423 | str = strstr(str, ","); | ||
424 | if (str) { | ||
425 | str++; | ||
426 | goto again; | ||
427 | } | ||
428 | |||
429 | return 0; | ||
430 | } | ||
431 | |||
432 | |||
433 | /* | ||
434 | * perfstat | ||
435 | */ | ||
436 | |||
437 | char fault_here[1000000]; | ||
438 | |||
439 | static void create_perfstat_counter(int counter) | ||
440 | { | ||
441 | struct perf_counter_hw_event hw_event; | ||
442 | |||
443 | memset(&hw_event, 0, sizeof(hw_event)); | ||
444 | hw_event.config = event_id[counter]; | ||
445 | hw_event.record_type = 0; | ||
446 | hw_event.nmi = 0; | ||
447 | if (scale) | ||
448 | hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | | ||
449 | PERF_FORMAT_TOTAL_TIME_RUNNING; | ||
450 | |||
451 | if (system_wide) { | ||
452 | int cpu; | ||
453 | for (cpu = 0; cpu < nr_cpus; cpu ++) { | ||
454 | fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); | ||
455 | if (fd[cpu][counter] < 0) { | ||
456 | printf("perfstat error: syscall returned with %d (%s)\n", | ||
457 | fd[cpu][counter], strerror(errno)); | ||
458 | exit(-1); | ||
459 | } | ||
460 | } | ||
461 | } else { | ||
462 | hw_event.inherit = 1; | ||
463 | hw_event.disabled = 1; | ||
464 | |||
465 | fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); | ||
466 | if (fd[0][counter] < 0) { | ||
467 | printf("perfstat error: syscall returned with %d (%s)\n", | ||
468 | fd[0][counter], strerror(errno)); | ||
469 | exit(-1); | ||
470 | } | ||
471 | } | ||
472 | } | ||
473 | |||
474 | int do_perfstat(int argc, char *argv[]) | ||
475 | { | ||
476 | unsigned long long t0, t1; | ||
477 | int counter; | ||
478 | ssize_t res; | ||
479 | int status; | ||
480 | int pid; | ||
481 | |||
482 | if (!system_wide) | ||
483 | nr_cpus = 1; | ||
484 | |||
485 | for (counter = 0; counter < nr_counters; counter++) | ||
486 | create_perfstat_counter(counter); | ||
487 | |||
488 | argc -= optind; | ||
489 | argv += optind; | ||
490 | |||
491 | if (!argc) | ||
492 | display_help(); | ||
493 | |||
494 | /* | ||
495 | * Enable counters and exec the command: | ||
496 | */ | ||
497 | t0 = rdclock(); | ||
498 | prctl(PR_TASK_PERF_COUNTERS_ENABLE); | ||
499 | |||
500 | if ((pid = fork()) < 0) | ||
501 | perror("failed to fork"); | ||
502 | if (!pid) { | ||
503 | if (execvp(argv[0], argv)) { | ||
504 | perror(argv[0]); | ||
505 | exit(-1); | ||
506 | } | ||
507 | } | ||
508 | while (wait(&status) >= 0) | ||
509 | ; | ||
510 | prctl(PR_TASK_PERF_COUNTERS_DISABLE); | ||
511 | t1 = rdclock(); | ||
512 | |||
513 | fflush(stdout); | ||
514 | |||
515 | fprintf(stderr, "\n"); | ||
516 | fprintf(stderr, " Performance counter stats for \'%s\':\n", | ||
517 | argv[0]); | ||
518 | fprintf(stderr, "\n"); | ||
519 | |||
520 | for (counter = 0; counter < nr_counters; counter++) { | ||
521 | int cpu, nv; | ||
522 | __u64 count[3], single_count[3]; | ||
523 | int scaled; | ||
524 | |||
525 | count[0] = count[1] = count[2] = 0; | ||
526 | nv = scale ? 3 : 1; | ||
527 | for (cpu = 0; cpu < nr_cpus; cpu ++) { | ||
528 | res = read(fd[cpu][counter], | ||
529 | single_count, nv * sizeof(__u64)); | ||
530 | assert(res == nv * sizeof(__u64)); | ||
531 | |||
532 | count[0] += single_count[0]; | ||
533 | if (scale) { | ||
534 | count[1] += single_count[1]; | ||
535 | count[2] += single_count[2]; | ||
536 | } | ||
537 | } | ||
538 | |||
539 | scaled = 0; | ||
540 | if (scale) { | ||
541 | if (count[2] == 0) { | ||
542 | fprintf(stderr, " %14s %-20s\n", | ||
543 | "<not counted>", event_name(counter)); | ||
544 | continue; | ||
545 | } | ||
546 | if (count[2] < count[1]) { | ||
547 | scaled = 1; | ||
548 | count[0] = (unsigned long long) | ||
549 | ((double)count[0] * count[1] / count[2] + 0.5); | ||
550 | } | ||
551 | } | ||
552 | |||
553 | if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || | ||
554 | event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { | ||
555 | |||
556 | double msecs = (double)count[0] / 1000000; | ||
557 | |||
558 | fprintf(stderr, " %14.6f %-20s (msecs)", | ||
559 | msecs, event_name(counter)); | ||
560 | } else { | ||
561 | fprintf(stderr, " %14Ld %-20s (events)", | ||
562 | count[0], event_name(counter)); | ||
563 | } | ||
564 | if (scaled) | ||
565 | fprintf(stderr, " (scaled from %.2f%%)", | ||
566 | (double) count[2] / count[1] * 100); | ||
567 | fprintf(stderr, "\n"); | ||
568 | } | ||
569 | fprintf(stderr, "\n"); | ||
570 | fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", | ||
571 | (double)(t1-t0)/1e6); | ||
572 | fprintf(stderr, "\n"); | ||
573 | |||
574 | return 0; | ||
575 | } | ||
576 | |||
577 | /* | ||
578 | * Symbols | ||
579 | */ | ||
580 | |||
581 | static uint64_t min_ip; | ||
582 | static uint64_t max_ip = -1ll; | ||
583 | |||
584 | struct sym_entry { | ||
585 | unsigned long long addr; | ||
586 | char *sym; | ||
587 | unsigned long count[MAX_COUNTERS]; | ||
588 | int skip; | ||
589 | struct source_line *source; | ||
590 | }; | ||
591 | |||
592 | #define MAX_SYMS 100000 | ||
593 | |||
594 | static int sym_table_count; | ||
595 | |||
596 | struct sym_entry *sym_filter_entry; | ||
597 | |||
598 | static struct sym_entry sym_table[MAX_SYMS]; | ||
599 | |||
600 | static void show_details(struct sym_entry *sym); | ||
601 | |||
602 | /* | ||
603 | * Ordering weight: count-1 * count-2 * ... / count-n | ||
604 | */ | ||
605 | static double sym_weight(const struct sym_entry *sym) | ||
606 | { | ||
607 | double weight; | ||
608 | int counter; | ||
609 | |||
610 | weight = sym->count[0]; | ||
611 | |||
612 | for (counter = 1; counter < nr_counters-1; counter++) | ||
613 | weight *= sym->count[counter]; | ||
614 | |||
615 | weight /= (sym->count[counter] + 1); | ||
616 | |||
617 | return weight; | ||
618 | } | ||
619 | |||
620 | static int compare(const void *__sym1, const void *__sym2) | ||
621 | { | ||
622 | const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; | ||
623 | |||
624 | return sym_weight(sym1) < sym_weight(sym2); | ||
625 | } | ||
626 | |||
627 | static long events; | ||
628 | static long userspace_events; | ||
629 | static const char CONSOLE_CLEAR[] = "\033[H\033[2J"; | ||
630 | |||
631 | static struct sym_entry tmp[MAX_SYMS]; | ||
632 | |||
633 | static void print_sym_table(void) | ||
634 | { | ||
635 | int i, printed; | ||
636 | int counter; | ||
637 | float events_per_sec = events/delay_secs; | ||
638 | float kevents_per_sec = (events-userspace_events)/delay_secs; | ||
639 | float sum_kevents = 0.0; | ||
640 | |||
641 | events = userspace_events = 0; | ||
642 | memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); | ||
643 | qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); | ||
644 | |||
645 | for (i = 0; i < sym_table_count && tmp[i].count[0]; i++) | ||
646 | sum_kevents += tmp[i].count[0]; | ||
647 | |||
648 | write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); | ||
649 | |||
650 | printf( | ||
651 | "------------------------------------------------------------------------------\n"); | ||
652 | printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ", | ||
653 | events_per_sec, | ||
654 | 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)), | ||
655 | nmi ? "NMI" : "IRQ"); | ||
656 | |||
657 | if (nr_counters == 1) | ||
658 | printf("%d ", event_count[0]); | ||
659 | |||
660 | for (counter = 0; counter < nr_counters; counter++) { | ||
661 | if (counter) | ||
662 | printf("/"); | ||
663 | |||
664 | printf("%s", event_name(counter)); | ||
665 | } | ||
666 | |||
667 | printf( "], "); | ||
668 | |||
669 | if (tid != -1) | ||
670 | printf(" (tid: %d", tid); | ||
671 | else | ||
672 | printf(" (all"); | ||
673 | |||
674 | if (profile_cpu != -1) | ||
675 | printf(", cpu: %d)\n", profile_cpu); | ||
676 | else { | ||
677 | if (tid != -1) | ||
678 | printf(")\n"); | ||
679 | else | ||
680 | printf(", %d CPUs)\n", nr_cpus); | ||
681 | } | ||
682 | |||
683 | printf("------------------------------------------------------------------------------\n\n"); | ||
684 | |||
685 | if (nr_counters == 1) | ||
686 | printf(" events pcnt"); | ||
687 | else | ||
688 | printf(" weight events pcnt"); | ||
689 | |||
690 | printf(" RIP kernel function\n" | ||
691 | " ______ ______ _____ ________________ _______________\n\n" | ||
692 | ); | ||
693 | |||
694 | for (i = 0, printed = 0; i < sym_table_count; i++) { | ||
695 | float pcnt; | ||
696 | int count; | ||
697 | |||
698 | if (printed <= 18 && tmp[i].count[0] >= count_filter) { | ||
699 | pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents)); | ||
700 | |||
701 | if (nr_counters == 1) | ||
702 | printf("%19.2f - %4.1f%% - %016llx : %s\n", | ||
703 | sym_weight(tmp + i), | ||
704 | pcnt, tmp[i].addr, tmp[i].sym); | ||
705 | else | ||
706 | printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n", | ||
707 | sym_weight(tmp + i), | ||
708 | tmp[i].count[0], | ||
709 | pcnt, tmp[i].addr, tmp[i].sym); | ||
710 | printed++; | ||
711 | } | ||
712 | /* | ||
713 | * Add decay to the counts: | ||
714 | */ | ||
715 | for (count = 0; count < nr_counters; count++) | ||
716 | sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8; | ||
717 | } | ||
718 | |||
719 | if (sym_filter_entry) | ||
720 | show_details(sym_filter_entry); | ||
721 | |||
722 | { | ||
723 | struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; | ||
724 | |||
725 | if (poll(&stdin_poll, 1, 0) == 1) { | ||
726 | printf("key pressed - exiting.\n"); | ||
727 | exit(0); | ||
728 | } | ||
729 | } | ||
730 | } | ||
731 | |||
732 | static void *display_thread(void *arg) | ||
733 | { | ||
734 | printf("KernelTop refresh period: %d seconds\n", delay_secs); | ||
735 | |||
736 | while (!sleep(delay_secs)) | ||
737 | print_sym_table(); | ||
738 | |||
739 | return NULL; | ||
740 | } | ||
741 | |||
742 | static int read_symbol(FILE *in, struct sym_entry *s) | ||
743 | { | ||
744 | static int filter_match = 0; | ||
745 | char *sym, stype; | ||
746 | char str[500]; | ||
747 | int rc, pos; | ||
748 | |||
749 | rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str); | ||
750 | if (rc == EOF) | ||
751 | return -1; | ||
752 | |||
753 | assert(rc == 3); | ||
754 | |||
755 | /* skip until end of line: */ | ||
756 | pos = strlen(str); | ||
757 | do { | ||
758 | rc = fgetc(in); | ||
759 | if (rc == '\n' || rc == EOF || pos >= 499) | ||
760 | break; | ||
761 | str[pos] = rc; | ||
762 | pos++; | ||
763 | } while (1); | ||
764 | str[pos] = 0; | ||
765 | |||
766 | sym = str; | ||
767 | |||
768 | /* Filter out known duplicates and non-text symbols. */ | ||
769 | if (!strcmp(sym, "_text")) | ||
770 | return 1; | ||
771 | if (!min_ip && !strcmp(sym, "_stext")) | ||
772 | return 1; | ||
773 | if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext")) | ||
774 | return 1; | ||
775 | if (stype != 'T' && stype != 't') | ||
776 | return 1; | ||
777 | if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14)) | ||
778 | return 1; | ||
779 | if (strstr(sym, "_text_start") || strstr(sym, "_text_end")) | ||
780 | return 1; | ||
781 | |||
782 | s->sym = malloc(strlen(str) + 1); | ||
783 | assert(s->sym); | ||
784 | |||
785 | strcpy((char *)s->sym, str); | ||
786 | s->skip = 0; | ||
787 | |||
788 | /* Tag events to be skipped. */ | ||
789 | if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym)) | ||
790 | s->skip = 1; | ||
791 | else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) | ||
792 | s->skip = 1; | ||
793 | else if (!strcmp("mwait_idle", s->sym)) | ||
794 | s->skip = 1; | ||
795 | |||
796 | if (filter_match == 1) { | ||
797 | filter_end = s->addr; | ||
798 | filter_match = -1; | ||
799 | if (filter_end - filter_start > 10000) { | ||
800 | printf("hm, too large filter symbol <%s> - skipping.\n", | ||
801 | sym_filter); | ||
802 | printf("symbol filter start: %016lx\n", filter_start); | ||
803 | printf(" end: %016lx\n", filter_end); | ||
804 | filter_end = filter_start = 0; | ||
805 | sym_filter = NULL; | ||
806 | sleep(1); | ||
807 | } | ||
808 | } | ||
809 | if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) { | ||
810 | filter_match = 1; | ||
811 | filter_start = s->addr; | ||
812 | } | ||
813 | |||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | int compare_addr(const void *__sym1, const void *__sym2) | ||
818 | { | ||
819 | const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; | ||
820 | |||
821 | return sym1->addr > sym2->addr; | ||
822 | } | ||
823 | |||
824 | static void sort_symbol_table(void) | ||
825 | { | ||
826 | int i, dups; | ||
827 | |||
828 | do { | ||
829 | qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr); | ||
830 | for (i = 0, dups = 0; i < sym_table_count - 1; i++) { | ||
831 | if (sym_table[i].addr == sym_table[i+1].addr) { | ||
832 | sym_table[i+1].addr = -1ll; | ||
833 | dups++; | ||
834 | } | ||
835 | } | ||
836 | sym_table_count -= dups; | ||
837 | } while(dups); | ||
838 | } | ||
839 | |||
840 | static void parse_symbols(void) | ||
841 | { | ||
842 | struct sym_entry *last; | ||
843 | |||
844 | FILE *kallsyms = fopen("/proc/kallsyms", "r"); | ||
845 | |||
846 | if (!kallsyms) { | ||
847 | printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n"); | ||
848 | exit(-1); | ||
849 | } | ||
850 | |||
851 | while (!feof(kallsyms)) { | ||
852 | if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) { | ||
853 | sym_table_count++; | ||
854 | assert(sym_table_count <= MAX_SYMS); | ||
855 | } | ||
856 | } | ||
857 | |||
858 | sort_symbol_table(); | ||
859 | min_ip = sym_table[0].addr; | ||
860 | max_ip = sym_table[sym_table_count-1].addr; | ||
861 | last = sym_table + sym_table_count++; | ||
862 | |||
863 | last->addr = -1ll; | ||
864 | last->sym = "<end>"; | ||
865 | |||
866 | if (filter_end) { | ||
867 | int count; | ||
868 | for (count=0; count < sym_table_count; count ++) { | ||
869 | if (!strcmp(sym_table[count].sym, sym_filter)) { | ||
870 | sym_filter_entry = &sym_table[count]; | ||
871 | break; | ||
872 | } | ||
873 | } | ||
874 | } | ||
875 | if (dump_symtab) { | ||
876 | int i; | ||
877 | |||
878 | for (i = 0; i < sym_table_count; i++) | ||
879 | fprintf(stderr, "%llx %s\n", | ||
880 | sym_table[i].addr, sym_table[i].sym); | ||
881 | } | ||
882 | } | ||
883 | |||
884 | /* | ||
885 | * Source lines | ||
886 | */ | ||
887 | |||
888 | static void parse_vmlinux(char *filename) | ||
889 | { | ||
890 | FILE *file; | ||
891 | char command[PATH_MAX*2]; | ||
892 | if (!filename) | ||
893 | return; | ||
894 | |||
895 | sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename); | ||
896 | |||
897 | file = popen(command, "r"); | ||
898 | if (!file) | ||
899 | return; | ||
900 | |||
901 | lines_tail = &lines; | ||
902 | while (!feof(file)) { | ||
903 | struct source_line *src; | ||
904 | size_t dummy = 0; | ||
905 | char *c; | ||
906 | |||
907 | src = malloc(sizeof(struct source_line)); | ||
908 | assert(src != NULL); | ||
909 | memset(src, 0, sizeof(struct source_line)); | ||
910 | |||
911 | if (getline(&src->line, &dummy, file) < 0) | ||
912 | break; | ||
913 | if (!src->line) | ||
914 | break; | ||
915 | |||
916 | c = strchr(src->line, '\n'); | ||
917 | if (c) | ||
918 | *c = 0; | ||
919 | |||
920 | src->next = NULL; | ||
921 | *lines_tail = src; | ||
922 | lines_tail = &src->next; | ||
923 | |||
924 | if (strlen(src->line)>8 && src->line[8] == ':') | ||
925 | src->EIP = strtoull(src->line, NULL, 16); | ||
926 | if (strlen(src->line)>16 && src->line[16] == ':') | ||
927 | src->EIP = strtoull(src->line, NULL, 16); | ||
928 | } | ||
929 | pclose(file); | ||
930 | } | ||
931 | |||
932 | static void record_precise_ip(uint64_t ip) | ||
933 | { | ||
934 | struct source_line *line; | ||
935 | |||
936 | for (line = lines; line; line = line->next) { | ||
937 | if (line->EIP == ip) | ||
938 | line->count++; | ||
939 | if (line->EIP > ip) | ||
940 | break; | ||
941 | } | ||
942 | } | ||
943 | |||
944 | static void lookup_sym_in_vmlinux(struct sym_entry *sym) | ||
945 | { | ||
946 | struct source_line *line; | ||
947 | char pattern[PATH_MAX]; | ||
948 | sprintf(pattern, "<%s>:", sym->sym); | ||
949 | |||
950 | for (line = lines; line; line = line->next) { | ||
951 | if (strstr(line->line, pattern)) { | ||
952 | sym->source = line; | ||
953 | break; | ||
954 | } | ||
955 | } | ||
956 | } | ||
957 | |||
958 | static void show_lines(struct source_line *line_queue, int line_queue_count) | ||
959 | { | ||
960 | int i; | ||
961 | struct source_line *line; | ||
962 | |||
963 | line = line_queue; | ||
964 | for (i = 0; i < line_queue_count; i++) { | ||
965 | printf("%8li\t%s\n", line->count, line->line); | ||
966 | line = line->next; | ||
967 | } | ||
968 | } | ||
969 | |||
970 | #define TRACE_COUNT 3 | ||
971 | |||
972 | static void show_details(struct sym_entry *sym) | ||
973 | { | ||
974 | struct source_line *line; | ||
975 | struct source_line *line_queue = NULL; | ||
976 | int displayed = 0; | ||
977 | int line_queue_count = 0; | ||
978 | |||
979 | if (!sym->source) | ||
980 | lookup_sym_in_vmlinux(sym); | ||
981 | if (!sym->source) | ||
982 | return; | ||
983 | |||
984 | printf("Showing details for %s\n", sym->sym); | ||
985 | |||
986 | line = sym->source; | ||
987 | while (line) { | ||
988 | if (displayed && strstr(line->line, ">:")) | ||
989 | break; | ||
990 | |||
991 | if (!line_queue_count) | ||
992 | line_queue = line; | ||
993 | line_queue_count ++; | ||
994 | |||
995 | if (line->count >= count_filter) { | ||
996 | show_lines(line_queue, line_queue_count); | ||
997 | line_queue_count = 0; | ||
998 | line_queue = NULL; | ||
999 | } else if (line_queue_count > TRACE_COUNT) { | ||
1000 | line_queue = line_queue->next; | ||
1001 | line_queue_count --; | ||
1002 | } | ||
1003 | |||
1004 | line->count = 0; | ||
1005 | displayed++; | ||
1006 | if (displayed > 300) | ||
1007 | break; | ||
1008 | line = line->next; | ||
1009 | } | ||
1010 | } | ||
1011 | |||
1012 | /* | ||
1013 | * Binary search in the histogram table and record the hit: | ||
1014 | */ | ||
1015 | static void record_ip(uint64_t ip, int counter) | ||
1016 | { | ||
1017 | int left_idx, middle_idx, right_idx, idx; | ||
1018 | unsigned long left, middle, right; | ||
1019 | |||
1020 | record_precise_ip(ip); | ||
1021 | |||
1022 | left_idx = 0; | ||
1023 | right_idx = sym_table_count-1; | ||
1024 | assert(ip <= max_ip && ip >= min_ip); | ||
1025 | |||
1026 | while (left_idx + 1 < right_idx) { | ||
1027 | middle_idx = (left_idx + right_idx) / 2; | ||
1028 | |||
1029 | left = sym_table[ left_idx].addr; | ||
1030 | middle = sym_table[middle_idx].addr; | ||
1031 | right = sym_table[ right_idx].addr; | ||
1032 | |||
1033 | if (!(left <= middle && middle <= right)) { | ||
1034 | printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right); | ||
1035 | printf("%d %d %d\n", left_idx, middle_idx, right_idx); | ||
1036 | } | ||
1037 | assert(left <= middle && middle <= right); | ||
1038 | if (!(left <= ip && ip <= right)) { | ||
1039 | printf(" left: %016lx\n", left); | ||
1040 | printf(" ip: %016lx\n", (unsigned long)ip); | ||
1041 | printf("right: %016lx\n", right); | ||
1042 | } | ||
1043 | assert(left <= ip && ip <= right); | ||
1044 | /* | ||
1045 | * [ left .... target .... middle .... right ] | ||
1046 | * => right := middle | ||
1047 | */ | ||
1048 | if (ip < middle) { | ||
1049 | right_idx = middle_idx; | ||
1050 | continue; | ||
1051 | } | ||
1052 | /* | ||
1053 | * [ left .... middle ... target ... right ] | ||
1054 | * => left := middle | ||
1055 | */ | ||
1056 | left_idx = middle_idx; | ||
1057 | } | ||
1058 | |||
1059 | idx = left_idx; | ||
1060 | |||
1061 | if (!sym_table[idx].skip) | ||
1062 | sym_table[idx].count[counter]++; | ||
1063 | else events--; | ||
1064 | } | ||
1065 | |||
1066 | static void process_event(uint64_t ip, int counter) | ||
1067 | { | ||
1068 | events++; | ||
1069 | |||
1070 | if (ip < min_ip || ip > max_ip) { | ||
1071 | userspace_events++; | ||
1072 | return; | ||
1073 | } | ||
1074 | |||
1075 | record_ip(ip, counter); | ||
1076 | } | ||
1077 | |||
1078 | static void process_options(int argc, char *argv[]) | ||
1079 | { | ||
1080 | int error = 0, counter; | ||
1081 | |||
1082 | if (strstr(argv[0], "perfstat")) | ||
1083 | run_perfstat = 1; | ||
1084 | |||
1085 | for (;;) { | ||
1086 | int option_index = 0; | ||
1087 | /** Options for getopt */ | ||
1088 | static struct option long_options[] = { | ||
1089 | {"count", required_argument, NULL, 'c'}, | ||
1090 | {"cpu", required_argument, NULL, 'C'}, | ||
1091 | {"delay", required_argument, NULL, 'd'}, | ||
1092 | {"dump_symtab", no_argument, NULL, 'D'}, | ||
1093 | {"event", required_argument, NULL, 'e'}, | ||
1094 | {"filter", required_argument, NULL, 'f'}, | ||
1095 | {"group", required_argument, NULL, 'g'}, | ||
1096 | {"help", no_argument, NULL, 'h'}, | ||
1097 | {"nmi", required_argument, NULL, 'n'}, | ||
1098 | {"mmap_info", no_argument, NULL, 'M'}, | ||
1099 | {"mmap_pages", required_argument, NULL, 'm'}, | ||
1100 | {"munmap_info", no_argument, NULL, 'U'}, | ||
1101 | {"pid", required_argument, NULL, 'p'}, | ||
1102 | {"realtime", required_argument, NULL, 'r'}, | ||
1103 | {"scale", no_argument, NULL, 'l'}, | ||
1104 | {"symbol", required_argument, NULL, 's'}, | ||
1105 | {"stat", no_argument, NULL, 'S'}, | ||
1106 | {"vmlinux", required_argument, NULL, 'x'}, | ||
1107 | {"zero", no_argument, NULL, 'z'}, | ||
1108 | {NULL, 0, NULL, 0 } | ||
1109 | }; | ||
1110 | int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU", | ||
1111 | long_options, &option_index); | ||
1112 | if (c == -1) | ||
1113 | break; | ||
1114 | |||
1115 | switch (c) { | ||
1116 | case 'a': system_wide = 1; break; | ||
1117 | case 'c': default_interval = atoi(optarg); break; | ||
1118 | case 'C': | ||
1119 | /* CPU and PID are mutually exclusive */ | ||
1120 | if (tid != -1) { | ||
1121 | printf("WARNING: CPU switch overriding PID\n"); | ||
1122 | sleep(1); | ||
1123 | tid = -1; | ||
1124 | } | ||
1125 | profile_cpu = atoi(optarg); break; | ||
1126 | case 'd': delay_secs = atoi(optarg); break; | ||
1127 | case 'D': dump_symtab = 1; break; | ||
1128 | |||
1129 | case 'e': error = parse_events(optarg); break; | ||
1130 | |||
1131 | case 'f': count_filter = atoi(optarg); break; | ||
1132 | case 'g': group = atoi(optarg); break; | ||
1133 | case 'h': display_help(); break; | ||
1134 | case 'l': scale = 1; break; | ||
1135 | case 'n': nmi = atoi(optarg); break; | ||
1136 | case 'p': | ||
1137 | /* CPU and PID are mutually exclusive */ | ||
1138 | if (profile_cpu != -1) { | ||
1139 | printf("WARNING: PID switch overriding CPU\n"); | ||
1140 | sleep(1); | ||
1141 | profile_cpu = -1; | ||
1142 | } | ||
1143 | tid = atoi(optarg); break; | ||
1144 | case 'r': realtime_prio = atoi(optarg); break; | ||
1145 | case 's': sym_filter = strdup(optarg); break; | ||
1146 | case 'S': run_perfstat = 1; break; | ||
1147 | case 'x': vmlinux = strdup(optarg); break; | ||
1148 | case 'z': zero = 1; break; | ||
1149 | case 'm': mmap_pages = atoi(optarg); break; | ||
1150 | case 'M': use_mmap = 1; break; | ||
1151 | case 'U': use_munmap = 1; break; | ||
1152 | default: error = 1; break; | ||
1153 | } | ||
1154 | } | ||
1155 | if (error) | ||
1156 | display_help(); | ||
1157 | |||
1158 | if (!nr_counters) { | ||
1159 | if (run_perfstat) | ||
1160 | nr_counters = 8; | ||
1161 | else { | ||
1162 | nr_counters = 1; | ||
1163 | event_id[0] = 0; | ||
1164 | } | ||
1165 | } | ||
1166 | |||
1167 | for (counter = 0; counter < nr_counters; counter++) { | ||
1168 | if (event_count[counter]) | ||
1169 | continue; | ||
1170 | |||
1171 | event_count[counter] = default_interval; | ||
1172 | } | ||
1173 | } | ||
1174 | |||
1175 | struct mmap_data { | ||
1176 | int counter; | ||
1177 | void *base; | ||
1178 | unsigned int mask; | ||
1179 | unsigned int prev; | ||
1180 | }; | ||
1181 | |||
1182 | static unsigned int mmap_read_head(struct mmap_data *md) | ||
1183 | { | ||
1184 | struct perf_counter_mmap_page *pc = md->base; | ||
1185 | int head; | ||
1186 | |||
1187 | head = pc->data_head; | ||
1188 | rmb(); | ||
1189 | |||
1190 | return head; | ||
1191 | } | ||
1192 | |||
1193 | struct timeval last_read, this_read; | ||
1194 | |||
1195 | static void mmap_read(struct mmap_data *md) | ||
1196 | { | ||
1197 | unsigned int head = mmap_read_head(md); | ||
1198 | unsigned int old = md->prev; | ||
1199 | unsigned char *data = md->base + page_size; | ||
1200 | int diff; | ||
1201 | |||
1202 | gettimeofday(&this_read, NULL); | ||
1203 | |||
1204 | /* | ||
1205 | * If we're further behind than half the buffer, there's a chance | ||
1206 | * the writer will bite our tail and screw up the events under us. | ||
1207 | * | ||
1208 | * If we somehow ended up ahead of the head, we got messed up. | ||
1209 | * | ||
1210 | * In either case, truncate and restart at head. | ||
1211 | */ | ||
1212 | diff = head - old; | ||
1213 | if (diff > md->mask / 2 || diff < 0) { | ||
1214 | struct timeval iv; | ||
1215 | unsigned long msecs; | ||
1216 | |||
1217 | timersub(&this_read, &last_read, &iv); | ||
1218 | msecs = iv.tv_sec*1000 + iv.tv_usec/1000; | ||
1219 | |||
1220 | fprintf(stderr, "WARNING: failed to keep up with mmap data." | ||
1221 | " Last read %lu msecs ago.\n", msecs); | ||
1222 | |||
1223 | /* | ||
1224 | * head points to a known good entry, start there. | ||
1225 | */ | ||
1226 | old = head; | ||
1227 | } | ||
1228 | |||
1229 | last_read = this_read; | ||
1230 | |||
1231 | for (; old != head;) { | ||
1232 | struct ip_event { | ||
1233 | struct perf_event_header header; | ||
1234 | __u64 ip; | ||
1235 | __u32 pid, tid; | ||
1236 | }; | ||
1237 | struct mmap_event { | ||
1238 | struct perf_event_header header; | ||
1239 | __u32 pid, tid; | ||
1240 | __u64 start; | ||
1241 | __u64 len; | ||
1242 | __u64 pgoff; | ||
1243 | char filename[PATH_MAX]; | ||
1244 | }; | ||
1245 | |||
1246 | typedef union event_union { | ||
1247 | struct perf_event_header header; | ||
1248 | struct ip_event ip; | ||
1249 | struct mmap_event mmap; | ||
1250 | } event_t; | ||
1251 | |||
1252 | event_t *event = (event_t *)&data[old & md->mask]; | ||
1253 | |||
1254 | event_t event_copy; | ||
1255 | |||
1256 | unsigned int size = event->header.size; | ||
1257 | |||
1258 | /* | ||
1259 | * Event straddles the mmap boundary -- header should always | ||
1260 | * be inside due to u64 alignment of output. | ||
1261 | */ | ||
1262 | if ((old & md->mask) + size != ((old + size) & md->mask)) { | ||
1263 | unsigned int offset = old; | ||
1264 | unsigned int len = min(sizeof(*event), size), cpy; | ||
1265 | void *dst = &event_copy; | ||
1266 | |||
1267 | do { | ||
1268 | cpy = min(md->mask + 1 - (offset & md->mask), len); | ||
1269 | memcpy(dst, &data[offset & md->mask], cpy); | ||
1270 | offset += cpy; | ||
1271 | dst += cpy; | ||
1272 | len -= cpy; | ||
1273 | } while (len); | ||
1274 | |||
1275 | event = &event_copy; | ||
1276 | } | ||
1277 | |||
1278 | old += size; | ||
1279 | |||
1280 | switch (event->header.type) { | ||
1281 | case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP: | ||
1282 | case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID: | ||
1283 | process_event(event->ip.ip, md->counter); | ||
1284 | break; | ||
1285 | |||
1286 | case PERF_EVENT_MMAP: | ||
1287 | case PERF_EVENT_MUNMAP: | ||
1288 | printf("%s: %Lu %Lu %Lu %s\n", | ||
1289 | event->header.type == PERF_EVENT_MMAP | ||
1290 | ? "mmap" : "munmap", | ||
1291 | event->mmap.start, | ||
1292 | event->mmap.len, | ||
1293 | event->mmap.pgoff, | ||
1294 | event->mmap.filename); | ||
1295 | break; | ||
1296 | } | ||
1297 | } | ||
1298 | |||
1299 | md->prev = old; | ||
1300 | } | ||
1301 | |||
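
The do/while in mmap_read() above stitches a record that straddles the end of the power-of-two data area into event_copy. The same copy as a standalone sketch (hypothetical helper, assuming 'ring' points at the mask+1-byte data area that follows the control page):

#include <string.h>

/* Copy 'size' bytes starting at absolute position 'offset' out of a
 * power-of-two ring buffer of size mask+1, handling the wrap. */
static void ring_copy(void *dst, const unsigned char *ring,
                      unsigned int mask, unsigned int offset,
                      unsigned int size)
{
        while (size) {
                /* bytes left until the physical end of the ring */
                unsigned int chunk = mask + 1 - (offset & mask);

                if (chunk > size)
                        chunk = size;
                memcpy(dst, &ring[offset & mask], chunk);
                dst = (char *)dst + chunk;
                offset += chunk;
                size -= chunk;
        }
}
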
1302 | int main(int argc, char *argv[]) | ||
1303 | { | ||
1304 | struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; | ||
1305 | struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; | ||
1306 | struct perf_counter_hw_event hw_event; | ||
1307 | pthread_t thread; | ||
1308 | int i, counter, group_fd, nr_poll = 0; | ||
1309 | unsigned int cpu; | ||
1310 | int ret; | ||
1311 | |||
1312 | page_size = sysconf(_SC_PAGE_SIZE); | ||
1313 | |||
1314 | process_options(argc, argv); | ||
1315 | |||
1316 | nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); | ||
1317 | assert(nr_cpus <= MAX_NR_CPUS); | ||
1318 | assert(nr_cpus >= 0); | ||
1319 | |||
1320 | if (run_perfstat) | ||
1321 | return do_perfstat(argc, argv); | ||
1322 | |||
1323 | if (tid != -1 || profile_cpu != -1) | ||
1324 | nr_cpus = 1; | ||
1325 | |||
1326 | parse_symbols(); | ||
1327 | if (vmlinux && sym_filter_entry) | ||
1328 | parse_vmlinux(vmlinux); | ||
1329 | |||
1330 | for (i = 0; i < nr_cpus; i++) { | ||
1331 | group_fd = -1; | ||
1332 | for (counter = 0; counter < nr_counters; counter++) { | ||
1333 | |||
1334 | cpu = profile_cpu; | ||
1335 | if (tid == -1 && profile_cpu == -1) | ||
1336 | cpu = i; | ||
1337 | |||
1338 | memset(&hw_event, 0, sizeof(hw_event)); | ||
1339 | hw_event.config = event_id[counter]; | ||
1340 | hw_event.irq_period = event_count[counter]; | ||
1341 | hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; | ||
1342 | hw_event.nmi = nmi; | ||
1343 | hw_event.mmap = use_mmap; | ||
1344 | hw_event.munmap = use_munmap; | ||
1345 | |||
1346 | fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); | ||
1347 | if (fd[i][counter] < 0) { | ||
1348 | int err = errno; | ||
1349 | printf("kerneltop error: syscall returned with %d (%s)\n", | ||
1350 | fd[i][counter], strerror(err)); | ||
1351 | if (err == EPERM) | ||
1352 | printf("Are you root?\n"); | ||
1353 | exit(-1); | ||
1354 | } | ||
1355 | assert(fd[i][counter] >= 0); | ||
1356 | fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); | ||
1357 | |||
1358 | /* | ||
1359 | * First counter acts as the group leader: | ||
1360 | */ | ||
1361 | if (group && group_fd == -1) | ||
1362 | group_fd = fd[i][counter]; | ||
1363 | |||
1364 | event_array[nr_poll].fd = fd[i][counter]; | ||
1365 | event_array[nr_poll].events = POLLIN; | ||
1366 | nr_poll++; | ||
1367 | |||
1368 | mmap_array[i][counter].counter = counter; | ||
1369 | mmap_array[i][counter].prev = 0; | ||
1370 | mmap_array[i][counter].mask = mmap_pages*page_size - 1; | ||
1371 | mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, | ||
1372 | PROT_READ, MAP_SHARED, fd[i][counter], 0); | ||
1373 | if (mmap_array[i][counter].base == MAP_FAILED) { | ||
1374 | printf("kerneltop error: failed to mmap with %d (%s)\n", | ||
1375 | errno, strerror(errno)); | ||
1376 | exit(-1); | ||
1377 | } | ||
1378 | } | ||
1379 | } | ||
1380 | |||
1381 | if (pthread_create(&thread, NULL, display_thread, NULL)) { | ||
1382 | printf("Could not create display thread.\n"); | ||
1383 | exit(-1); | ||
1384 | } | ||
1385 | |||
1386 | if (realtime_prio) { | ||
1387 | struct sched_param param; | ||
1388 | |||
1389 | param.sched_priority = realtime_prio; | ||
1390 | if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { | ||
1391 | printf("Could not set realtime priority.\n"); | ||
1392 | exit(-1); | ||
1393 | } | ||
1394 | } | ||
1395 | |||
1396 | while (1) { | ||
1397 | int hits = events; | ||
1398 | |||
1399 | for (i = 0; i < nr_cpus; i++) { | ||
1400 | for (counter = 0; counter < nr_counters; counter++) | ||
1401 | mmap_read(&mmap_array[i][counter]); | ||
1402 | } | ||
1403 | |||
1404 | if (hits == events) | ||
1405 | ret = poll(event_array, nr_poll, 100); | ||
1406 | } | ||
1407 | |||
1408 | return 0; | ||
1409 | } | ||
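
main() above relies on a sys_perf_counter_open() wrapper defined earlier in kerneltop.c (outside this hunk). A hedged sketch of such a wrapper and of reading one counting (non-sampling) counter back over its fd, assuming __NR_perf_counter_open is defined for the architecture (319 on powerpc per the unistd.h change below), that the header is reachable as <linux/perf_counter.h>, and that pid 0 selects the calling task:

#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int sys_perf_counter_open(struct perf_counter_hw_event *hw_event,
                                 pid_t pid, int cpu, int group_fd,
                                 unsigned long flags)
{
        return syscall(__NR_perf_counter_open, hw_event, pid, cpu,
                       group_fd, flags);
}

/* Open a counter described by a caller-filled hw_event for the current
 * task on any cpu, read its 64-bit value, and close it. */
static __u64 read_one_counter(struct perf_counter_hw_event *hw_event)
{
        __u64 count = 0;
        int fd = sys_perf_counter_open(hw_event, 0, -1, -1, 0);

        if (fd >= 0) {
                read(fd, &count, sizeof(count));
                close(fd);
        }
        return count;
}
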
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index b7e034b0a6dd..20a44d0c9fdd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h | |||
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags) | |||
131 | */ | 131 | */ |
132 | struct irq_chip; | 132 | struct irq_chip; |
133 | 133 | ||
134 | #ifdef CONFIG_PERF_COUNTERS | ||
135 | static inline unsigned long test_perf_counter_pending(void) | ||
136 | { | ||
137 | unsigned long x; | ||
138 | |||
139 | asm volatile("lbz %0,%1(13)" | ||
140 | : "=r" (x) | ||
141 | : "i" (offsetof(struct paca_struct, perf_counter_pending))); | ||
142 | return x; | ||
143 | } | ||
144 | |||
145 | static inline void set_perf_counter_pending(void) | ||
146 | { | ||
147 | asm volatile("stb %0,%1(13)" : : | ||
148 | "r" (1), | ||
149 | "i" (offsetof(struct paca_struct, perf_counter_pending))); | ||
150 | } | ||
151 | |||
152 | static inline void clear_perf_counter_pending(void) | ||
153 | { | ||
154 | asm volatile("stb %0,%1(13)" : : | ||
155 | "r" (0), | ||
156 | "i" (offsetof(struct paca_struct, perf_counter_pending))); | ||
157 | } | ||
158 | |||
159 | extern void perf_counter_do_pending(void); | ||
160 | |||
161 | #else | ||
162 | |||
163 | static inline unsigned long test_perf_counter_pending(void) | ||
164 | { | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | static inline void set_perf_counter_pending(void) {} | ||
169 | static inline void clear_perf_counter_pending(void) {} | ||
170 | static inline void perf_counter_do_pending(void) {} | ||
171 | #endif /* CONFIG_PERF_COUNTERS */ | ||
172 | |||
134 | #endif /* __KERNEL__ */ | 173 | #endif /* __KERNEL__ */ |
135 | #endif /* _ASM_POWERPC_HW_IRQ_H */ | 174 | #endif /* _ASM_POWERPC_HW_IRQ_H */ |
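
The three helpers added above keep a one-byte "pending" flag in the paca so a PMU interrupt taken while interrupts are soft-disabled can defer its wakeups; the flag is consumed on the interrupt-enable path (see the entry_64.S and irq.c hunks below). A C-level sketch of the accessors, using get_paca() from asm/paca.h; the asm versions go through r13 directly so they stay safe in any context:

static inline unsigned long test_perf_counter_pending_c(void)
{
        return get_paca()->perf_counter_pending;
}

static inline void set_perf_counter_pending_c(void)
{
        get_paca()->perf_counter_pending = 1;
}

static inline void clear_perf_counter_pending_c(void)
{
        get_paca()->perf_counter_pending = 0;
}
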
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 082b3aedf145..6ef055723019 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h | |||
@@ -99,6 +99,7 @@ struct paca_struct { | |||
99 | u8 soft_enabled; /* irq soft-enable flag */ | 99 | u8 soft_enabled; /* irq soft-enable flag */ |
100 | u8 hard_enabled; /* set if irqs are enabled in MSR */ | 100 | u8 hard_enabled; /* set if irqs are enabled in MSR */ |
101 | u8 io_sync; /* writel() needs spin_unlock sync */ | 101 | u8 io_sync; /* writel() needs spin_unlock sync */ |
102 | u8 perf_counter_pending; /* PM interrupt while soft-disabled */ | ||
102 | 103 | ||
103 | /* Stuff for accurate time accounting */ | 104 | /* Stuff for accurate time accounting */ |
104 | u64 user_time; /* accumulated usermode TB ticks */ | 105 | u64 user_time; /* accumulated usermode TB ticks */ |
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h new file mode 100644 index 000000000000..9d7ff6d7fb56 --- /dev/null +++ b/arch/powerpc/include/asm/perf_counter.h | |||
@@ -0,0 +1,72 @@ | |||
1 | /* | ||
2 | * Performance counter support - PowerPC-specific definitions. | ||
3 | * | ||
4 | * Copyright 2008-2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/types.h> | ||
12 | |||
13 | #define MAX_HWCOUNTERS 8 | ||
14 | #define MAX_EVENT_ALTERNATIVES 8 | ||
15 | |||
16 | /* | ||
17 | * This struct provides the constants and functions needed to | ||
18 | * describe the PMU on a particular POWER-family CPU. | ||
19 | */ | ||
20 | struct power_pmu { | ||
21 | int n_counter; | ||
22 | int max_alternatives; | ||
23 | u64 add_fields; | ||
24 | u64 test_adder; | ||
25 | int (*compute_mmcr)(unsigned int events[], int n_ev, | ||
26 | unsigned int hwc[], u64 mmcr[]); | ||
27 | int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); | ||
28 | int (*get_alternatives)(unsigned int event, unsigned int alt[]); | ||
29 | void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); | ||
30 | int n_generic; | ||
31 | int *generic_events; | ||
32 | }; | ||
33 | |||
34 | extern struct power_pmu *ppmu; | ||
35 | |||
36 | /* | ||
37 | * The power_pmu.get_constraint function returns a 64-bit value and | ||
38 | * a 64-bit mask that express the constraints between this event and | ||
39 | * other events. | ||
40 | * | ||
41 | * The value and mask are divided up into (non-overlapping) bitfields | ||
42 | * of three different types: | ||
43 | * | ||
44 | * Select field: this expresses the constraint that some set of bits | ||
45 | * in MMCR* needs to be set to a specific value for this event. For a | ||
46 | * select field, the mask contains 1s in every bit of the field, and | ||
47 | * the value contains a unique value for each possible setting of the | ||
48 | * MMCR* bits. The constraint checking code will ensure that two events | ||
49 | * that set the same field in their masks have the same value in their | ||
50 | * value dwords. | ||
51 | * | ||
52 | * Add field: this expresses the constraint that there can be at most | ||
53 | * N events in a particular class. A field of k bits can be used for | ||
54 | * N <= 2^(k-1) - 1. The mask has the most significant bit of the field | ||
55 | * set (and the other bits 0), and the value has only the least significant | ||
56 | * bit of the field set. In addition, the 'add_fields' and 'test_adder' | ||
57 | * in the struct power_pmu for this processor come into play. The | ||
58 | * add_fields value contains 1 in the LSB of the field, and the | ||
59 | * test_adder contains 2^(k-1) - 1 - N in the field. | ||
60 | * | ||
61 | * NAND field: this expresses the constraint that you may not have events | ||
62 | * in all of a set of classes. (For example, on PPC970, you can't select | ||
63 | * events from the FPU, ISU and IDU simultaneously, although any two are | ||
64 | * possible.) For N classes, the field is N+1 bits wide, and each class | ||
65 | * is assigned one bit from the least-significant N bits. The mask has | ||
66 | * only the most-significant bit set, and the value has only the bit | ||
67 | * for the event's class set. The test_adder has the least significant | ||
68 | * bit set in the field. | ||
69 | * | ||
70 | * If an event is not subject to the constraint expressed by a particular | ||
71 | * field, then it will have 0 in both the mask and value for that field. | ||
72 | */ | ||
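
As a concrete illustration of the "add field" encoding (made-up numbers, not taken from any real PMU table): a 4-bit field that allows at most N = 4 events of a class uses per-event value 0x1 (LSB), per-event mask 0x8 (MSB), add_fields = 0x1 and test_adder = 2^(k-1) - 1 - N = 3. The snippet below reproduces the arithmetic that power_check_constraints() in arch/powerpc/kernel/perf_counter.c (later in this series) applies, and rejects exactly the fifth event:

#include <stdio.h>

int main(void)
{
        unsigned long long value = 0, mask = 0;
        unsigned long long v = 0x1, m = 0x8, addf = 0x1, tadd = 0x3;
        int i;

        for (i = 1; i <= 5; i++) {
                /* adds 1 to the count held in the 4-bit field */
                unsigned long long nv = (value | v) + (value & v & addf);

                /* non-zero when the count would exceed N */
                if ((((nv + tadd) ^ value) & mask) ||
                    (((nv + tadd) ^ v) & m)) {
                        printf("event %d rejected (class full)\n", i);
                        break;
                }
                value = nv;
                mask |= m;
                printf("event %d accepted, count field = %llu\n", i, value & 0xf);
        }
        return 0;
}
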
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index fe166491e9dc..affa8caed7eb 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h | |||
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1) | |||
322 | SYSCALL_SPU(dup3) | 322 | SYSCALL_SPU(dup3) |
323 | SYSCALL_SPU(pipe2) | 323 | SYSCALL_SPU(pipe2) |
324 | SYSCALL(inotify_init1) | 324 | SYSCALL(inotify_init1) |
325 | SYSCALL_SPU(perf_counter_open) | ||
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index e07d0c76ed77..7cef5afe89d8 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h | |||
@@ -341,10 +341,11 @@ | |||
341 | #define __NR_dup3 316 | 341 | #define __NR_dup3 316 |
342 | #define __NR_pipe2 317 | 342 | #define __NR_pipe2 317 |
343 | #define __NR_inotify_init1 318 | 343 | #define __NR_inotify_init1 318 |
344 | #define __NR_perf_counter_open 319 | ||
344 | 345 | ||
345 | #ifdef __KERNEL__ | 346 | #ifdef __KERNEL__ |
346 | 347 | ||
347 | #define __NR_syscalls 319 | 348 | #define __NR_syscalls 320 |
348 | 349 | ||
349 | #define __NR__exit __NR_exit | 350 | #define __NR__exit __NR_exit |
350 | #define NR_syscalls __NR_syscalls | 351 | #define NR_syscalls __NR_syscalls |
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 71901fbda4a5..9ba1bb731fcc 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile | |||
@@ -94,6 +94,8 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o | |||
94 | 94 | ||
95 | obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o | 95 | obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o |
96 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o | 96 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o |
97 | obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \ | ||
98 | power5-pmu.o power5+-pmu.o power6-pmu.o | ||
97 | 99 | ||
98 | obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o | 100 | obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o |
99 | 101 | ||
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 1e40bc053946..e981d1ce1914 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c | |||
@@ -131,6 +131,7 @@ int main(void) | |||
131 | DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); | 131 | DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); |
132 | DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); | 132 | DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); |
133 | DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); | 133 | DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); |
134 | DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); | ||
134 | DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); | 135 | DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); |
135 | DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); | 136 | DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); |
136 | DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); | 137 | DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); |
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index abfc32330479..43e073477c34 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S | |||
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) | |||
526 | 2: | 526 | 2: |
527 | TRACE_AND_RESTORE_IRQ(r5); | 527 | TRACE_AND_RESTORE_IRQ(r5); |
528 | 528 | ||
529 | #ifdef CONFIG_PERF_COUNTERS | ||
530 | /* check paca->perf_counter_pending if we're enabling ints */ | ||
531 | lbz r3,PACAPERFPEND(r13) | ||
532 | and. r3,r3,r5 | ||
533 | beq 27f | ||
534 | bl .perf_counter_do_pending | ||
535 | 27: | ||
536 | #endif /* CONFIG_PERF_COUNTERS */ | ||
537 | |||
529 | /* extract EE bit and use it to restore paca->hard_enabled */ | 538 | /* extract EE bit and use it to restore paca->hard_enabled */ |
530 | ld r3,_MSR(r1) | 539 | ld r3,_MSR(r1) |
531 | rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ | 540 | rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ |
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5576147e57b6..2cd471f92fe6 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c | |||
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en) | |||
135 | iseries_handle_interrupts(); | 135 | iseries_handle_interrupts(); |
136 | } | 136 | } |
137 | 137 | ||
138 | if (test_perf_counter_pending()) { | ||
139 | clear_perf_counter_pending(); | ||
140 | perf_counter_do_pending(); | ||
141 | } | ||
142 | |||
138 | /* | 143 | /* |
139 | * if (get_paca()->hard_enabled) return; | 144 | * if (get_paca()->hard_enabled) return; |
140 | * But again we need to take care that gcc gets hard_enabled directly | 145 | * But again we need to take care that gcc gets hard_enabled directly |
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c new file mode 100644 index 000000000000..f88c35d0710a --- /dev/null +++ b/arch/powerpc/kernel/perf_counter.c | |||
@@ -0,0 +1,846 @@ | |||
1 | /* | ||
2 | * Performance counter support - powerpc architecture code | ||
3 | * | ||
4 | * Copyright 2008-2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/perf_counter.h> | ||
14 | #include <linux/percpu.h> | ||
15 | #include <linux/hardirq.h> | ||
16 | #include <asm/reg.h> | ||
17 | #include <asm/pmc.h> | ||
18 | #include <asm/machdep.h> | ||
19 | #include <asm/firmware.h> | ||
20 | |||
21 | struct cpu_hw_counters { | ||
22 | int n_counters; | ||
23 | int n_percpu; | ||
24 | int disabled; | ||
25 | int n_added; | ||
26 | struct perf_counter *counter[MAX_HWCOUNTERS]; | ||
27 | unsigned int events[MAX_HWCOUNTERS]; | ||
28 | u64 mmcr[3]; | ||
29 | u8 pmcs_enabled; | ||
30 | }; | ||
31 | DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); | ||
32 | |||
33 | struct power_pmu *ppmu; | ||
34 | |||
35 | /* | ||
36 | * Normally, to ignore kernel events we set the FCS (freeze counters | ||
37 | * in supervisor mode) bit in MMCR0, but if the kernel runs with the | ||
38 | * hypervisor bit set in the MSR, or if we are running on a processor | ||
39 | * where the hypervisor bit is forced to 1 (as on Apple G5 processors), | ||
40 | * then we need to use the FCHV bit to ignore kernel events. | ||
41 | */ | ||
42 | static unsigned int freeze_counters_kernel = MMCR0_FCS; | ||
43 | |||
44 | static void perf_counter_interrupt(struct pt_regs *regs); | ||
45 | |||
46 | void perf_counter_print_debug(void) | ||
47 | { | ||
48 | } | ||
49 | |||
50 | /* | ||
51 | * Read one performance monitor counter (PMC). | ||
52 | */ | ||
53 | static unsigned long read_pmc(int idx) | ||
54 | { | ||
55 | unsigned long val; | ||
56 | |||
57 | switch (idx) { | ||
58 | case 1: | ||
59 | val = mfspr(SPRN_PMC1); | ||
60 | break; | ||
61 | case 2: | ||
62 | val = mfspr(SPRN_PMC2); | ||
63 | break; | ||
64 | case 3: | ||
65 | val = mfspr(SPRN_PMC3); | ||
66 | break; | ||
67 | case 4: | ||
68 | val = mfspr(SPRN_PMC4); | ||
69 | break; | ||
70 | case 5: | ||
71 | val = mfspr(SPRN_PMC5); | ||
72 | break; | ||
73 | case 6: | ||
74 | val = mfspr(SPRN_PMC6); | ||
75 | break; | ||
76 | case 7: | ||
77 | val = mfspr(SPRN_PMC7); | ||
78 | break; | ||
79 | case 8: | ||
80 | val = mfspr(SPRN_PMC8); | ||
81 | break; | ||
82 | default: | ||
83 | printk(KERN_ERR "oops trying to read PMC%d\n", idx); | ||
84 | val = 0; | ||
85 | } | ||
86 | return val; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * Write one PMC. | ||
91 | */ | ||
92 | static void write_pmc(int idx, unsigned long val) | ||
93 | { | ||
94 | switch (idx) { | ||
95 | case 1: | ||
96 | mtspr(SPRN_PMC1, val); | ||
97 | break; | ||
98 | case 2: | ||
99 | mtspr(SPRN_PMC2, val); | ||
100 | break; | ||
101 | case 3: | ||
102 | mtspr(SPRN_PMC3, val); | ||
103 | break; | ||
104 | case 4: | ||
105 | mtspr(SPRN_PMC4, val); | ||
106 | break; | ||
107 | case 5: | ||
108 | mtspr(SPRN_PMC5, val); | ||
109 | break; | ||
110 | case 6: | ||
111 | mtspr(SPRN_PMC6, val); | ||
112 | break; | ||
113 | case 7: | ||
114 | mtspr(SPRN_PMC7, val); | ||
115 | break; | ||
116 | case 8: | ||
117 | mtspr(SPRN_PMC8, val); | ||
118 | break; | ||
119 | default: | ||
120 | printk(KERN_ERR "oops trying to write PMC%d\n", idx); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Check if a set of events can all go on the PMU at once. | ||
126 | * If they can't, this will look at alternative codes for the events | ||
127 | * and see if any combination of alternative codes is feasible. | ||
128 | * The feasible set is returned in event[]. | ||
129 | */ | ||
130 | static int power_check_constraints(unsigned int event[], int n_ev) | ||
131 | { | ||
132 | u64 mask, value, nv; | ||
133 | unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; | ||
134 | u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; | ||
135 | u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; | ||
136 | u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; | ||
137 | int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; | ||
138 | int i, j; | ||
139 | u64 addf = ppmu->add_fields; | ||
140 | u64 tadd = ppmu->test_adder; | ||
141 | |||
142 | if (n_ev > ppmu->n_counter) | ||
143 | return -1; | ||
144 | |||
145 | /* First see if the events will go on as-is */ | ||
146 | for (i = 0; i < n_ev; ++i) { | ||
147 | alternatives[i][0] = event[i]; | ||
148 | if (ppmu->get_constraint(event[i], &amasks[i][0], | ||
149 | &avalues[i][0])) | ||
150 | return -1; | ||
151 | choice[i] = 0; | ||
152 | } | ||
153 | value = mask = 0; | ||
154 | for (i = 0; i < n_ev; ++i) { | ||
155 | nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf); | ||
156 | if ((((nv + tadd) ^ value) & mask) != 0 || | ||
157 | (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0) | ||
158 | break; | ||
159 | value = nv; | ||
160 | mask |= amasks[i][0]; | ||
161 | } | ||
162 | if (i == n_ev) | ||
163 | return 0; /* all OK */ | ||
164 | |||
165 | /* doesn't work, gather alternatives... */ | ||
166 | if (!ppmu->get_alternatives) | ||
167 | return -1; | ||
168 | for (i = 0; i < n_ev; ++i) { | ||
169 | n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]); | ||
170 | for (j = 1; j < n_alt[i]; ++j) | ||
171 | ppmu->get_constraint(alternatives[i][j], | ||
172 | &amasks[i][j], &avalues[i][j]); | ||
173 | } | ||
174 | |||
175 | /* enumerate all possibilities and see if any will work */ | ||
176 | i = 0; | ||
177 | j = -1; | ||
178 | value = mask = nv = 0; | ||
179 | while (i < n_ev) { | ||
180 | if (j >= 0) { | ||
181 | /* we're backtracking, restore context */ | ||
182 | value = svalues[i]; | ||
183 | mask = smasks[i]; | ||
184 | j = choice[i]; | ||
185 | } | ||
186 | /* | ||
187 | * See if any alternative k for event i, | ||
188 | * where k > j, will satisfy the constraints. | ||
189 | */ | ||
190 | while (++j < n_alt[i]) { | ||
191 | nv = (value | avalues[i][j]) + | ||
192 | (value & avalues[i][j] & addf); | ||
193 | if ((((nv + tadd) ^ value) & mask) == 0 && | ||
194 | (((nv + tadd) ^ avalues[i][j]) | ||
195 | & amasks[i][j]) == 0) | ||
196 | break; | ||
197 | } | ||
198 | if (j >= n_alt[i]) { | ||
199 | /* | ||
200 | * No feasible alternative, backtrack | ||
201 | * to event i-1 and continue enumerating its | ||
202 | * alternatives from where we got up to. | ||
203 | */ | ||
204 | if (--i < 0) | ||
205 | return -1; | ||
206 | } else { | ||
207 | /* | ||
208 | * Found a feasible alternative for event i, | ||
209 | * remember where we got up to with this event, | ||
210 | * go on to the next event, and start with | ||
211 | * the first alternative for it. | ||
212 | */ | ||
213 | choice[i] = j; | ||
214 | svalues[i] = value; | ||
215 | smasks[i] = mask; | ||
216 | value = nv; | ||
217 | mask |= amasks[i][j]; | ||
218 | ++i; | ||
219 | j = -1; | ||
220 | } | ||
221 | } | ||
222 | |||
223 | /* OK, we have a feasible combination, tell the caller the solution */ | ||
224 | for (i = 0; i < n_ev; ++i) | ||
225 | event[i] = alternatives[i][choice[i]]; | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * Check if newly-added counters have consistent settings for | ||
231 | * exclude_{user,kernel,hv} with each other and any previously | ||
232 | * added counters. | ||
233 | */ | ||
234 | static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) | ||
235 | { | ||
236 | int eu, ek, eh; | ||
237 | int i, n; | ||
238 | struct perf_counter *counter; | ||
239 | |||
240 | n = n_prev + n_new; | ||
241 | if (n <= 1) | ||
242 | return 0; | ||
243 | |||
244 | eu = ctrs[0]->hw_event.exclude_user; | ||
245 | ek = ctrs[0]->hw_event.exclude_kernel; | ||
246 | eh = ctrs[0]->hw_event.exclude_hv; | ||
247 | if (n_prev == 0) | ||
248 | n_prev = 1; | ||
249 | for (i = n_prev; i < n; ++i) { | ||
250 | counter = ctrs[i]; | ||
251 | if (counter->hw_event.exclude_user != eu || | ||
252 | counter->hw_event.exclude_kernel != ek || | ||
253 | counter->hw_event.exclude_hv != eh) | ||
254 | return -EAGAIN; | ||
255 | } | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static void power_perf_read(struct perf_counter *counter) | ||
260 | { | ||
261 | long val, delta, prev; | ||
262 | |||
263 | if (!counter->hw.idx) | ||
264 | return; | ||
265 | /* | ||
266 | * Performance monitor interrupts come even when interrupts | ||
267 | * are soft-disabled, as long as interrupts are hard-enabled. | ||
268 | * Therefore we treat them like NMIs. | ||
269 | */ | ||
270 | do { | ||
271 | prev = atomic64_read(&counter->hw.prev_count); | ||
272 | barrier(); | ||
273 | val = read_pmc(counter->hw.idx); | ||
274 | } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); | ||
275 | |||
276 | /* The counters are only 32 bits wide */ | ||
277 | delta = (val - prev) & 0xfffffffful; | ||
278 | atomic64_add(delta, &counter->count); | ||
279 | atomic64_sub(delta, &counter->hw.period_left); | ||
280 | } | ||
281 | |||
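
power_perf_read() above snapshots the hardware PMC with a cmpxchg loop so a PMU interrupt that also updates prev_count cannot be double-counted, and masks the delta because the PMCs are only 32 bits wide. A minimal sketch of just the wrap arithmetic:

/* 32-bit wrap handling as used above: the difference is taken modulo
 * 2^32 before being accumulated into the 64-bit software count. */
static unsigned long pmc_delta(unsigned long prev, unsigned long val)
{
        return (val - prev) & 0xfffffffful;
}

/* e.g. prev = 0xfffffff0, val = 0x00000010  ->  delta = 0x20 */
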
282 | /* | ||
283 | * Disable all counters to prevent PMU interrupts and to allow | ||
284 | * counters to be added or removed. | ||
285 | */ | ||
286 | u64 hw_perf_save_disable(void) | ||
287 | { | ||
288 | struct cpu_hw_counters *cpuhw; | ||
289 | unsigned long ret; | ||
290 | unsigned long flags; | ||
291 | |||
292 | local_irq_save(flags); | ||
293 | cpuhw = &__get_cpu_var(cpu_hw_counters); | ||
294 | |||
295 | ret = cpuhw->disabled; | ||
296 | if (!ret) { | ||
297 | cpuhw->disabled = 1; | ||
298 | cpuhw->n_added = 0; | ||
299 | |||
300 | /* | ||
301 | * Check if we ever enabled the PMU on this cpu. | ||
302 | */ | ||
303 | if (!cpuhw->pmcs_enabled) { | ||
304 | if (ppc_md.enable_pmcs) | ||
305 | ppc_md.enable_pmcs(); | ||
306 | cpuhw->pmcs_enabled = 1; | ||
307 | } | ||
308 | |||
309 | /* | ||
310 | * Set the 'freeze counters' bit. | ||
311 | * The barrier is to make sure the mtspr has been | ||
312 | * executed and the PMU has frozen the counters | ||
313 | * before we return. | ||
314 | */ | ||
315 | mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); | ||
316 | mb(); | ||
317 | } | ||
318 | local_irq_restore(flags); | ||
319 | return ret; | ||
320 | } | ||
321 | |||
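
hw_perf_save_disable() returns the previous "disabled" state so a caller can freeze the PMU, reconfigure it, and hand the saved token back to hw_perf_restore(); power_perf_enable() and power_perf_disable() below use exactly this idiom. A short usage sketch (mirrors power_perf_enable(), with the reconfiguration elided):

static void reconfigure_counters_example(void)
{
        unsigned long flags;
        u64 pmudis;

        local_irq_save(flags);
        pmudis = hw_perf_save_disable();        /* freeze the PMU */

        /* ... add/remove counters, recompute MMCR* settings ... */

        hw_perf_restore(pmudis);                /* unfreeze only if we froze it */
        local_irq_restore(flags);
}
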
322 | /* | ||
323 | * Re-enable all counters if disable == 0. | ||
324 | * If we were previously disabled and counters were added, then | ||
325 | * put the new config on the PMU. | ||
326 | */ | ||
327 | void hw_perf_restore(u64 disable) | ||
328 | { | ||
329 | struct perf_counter *counter; | ||
330 | struct cpu_hw_counters *cpuhw; | ||
331 | unsigned long flags; | ||
332 | long i; | ||
333 | unsigned long val; | ||
334 | s64 left; | ||
335 | unsigned int hwc_index[MAX_HWCOUNTERS]; | ||
336 | |||
337 | if (disable) | ||
338 | return; | ||
339 | local_irq_save(flags); | ||
340 | cpuhw = &__get_cpu_var(cpu_hw_counters); | ||
341 | cpuhw->disabled = 0; | ||
342 | |||
343 | /* | ||
344 | * If we didn't change anything, or only removed counters, | ||
345 | * no need to recalculate MMCR* settings and reset the PMCs. | ||
346 | * Just reenable the PMU with the current MMCR* settings | ||
347 | * (possibly updated for removal of counters). | ||
348 | */ | ||
349 | if (!cpuhw->n_added) { | ||
350 | mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); | ||
351 | mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); | ||
352 | mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); | ||
353 | if (cpuhw->n_counters == 0) | ||
354 | get_lppaca()->pmcregs_in_use = 0; | ||
355 | goto out; | ||
356 | } | ||
357 | |||
358 | /* | ||
359 | * Compute MMCR* values for the new set of counters | ||
360 | */ | ||
361 | if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, | ||
362 | cpuhw->mmcr)) { | ||
363 | /* shouldn't ever get here */ | ||
364 | printk(KERN_ERR "oops compute_mmcr failed\n"); | ||
365 | goto out; | ||
366 | } | ||
367 | |||
368 | /* | ||
369 | * Add in MMCR0 freeze bits corresponding to the | ||
370 | * hw_event.exclude_* bits for the first counter. | ||
371 | * We have already checked that all counters have the | ||
372 | * same values for these bits as the first counter. | ||
373 | */ | ||
374 | counter = cpuhw->counter[0]; | ||
375 | if (counter->hw_event.exclude_user) | ||
376 | cpuhw->mmcr[0] |= MMCR0_FCP; | ||
377 | if (counter->hw_event.exclude_kernel) | ||
378 | cpuhw->mmcr[0] |= freeze_counters_kernel; | ||
379 | if (counter->hw_event.exclude_hv) | ||
380 | cpuhw->mmcr[0] |= MMCR0_FCHV; | ||
381 | |||
382 | /* | ||
383 | * Write the new configuration to MMCR* with the freeze | ||
384 | * bit set and set the hardware counters to their initial values. | ||
385 | * Then unfreeze the counters. | ||
386 | */ | ||
387 | get_lppaca()->pmcregs_in_use = 1; | ||
388 | mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); | ||
389 | mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); | ||
390 | mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) | ||
391 | | MMCR0_FC); | ||
392 | |||
393 | /* | ||
394 | * Read off any pre-existing counters that need to move | ||
395 | * to another PMC. | ||
396 | */ | ||
397 | for (i = 0; i < cpuhw->n_counters; ++i) { | ||
398 | counter = cpuhw->counter[i]; | ||
399 | if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { | ||
400 | power_perf_read(counter); | ||
401 | write_pmc(counter->hw.idx, 0); | ||
402 | counter->hw.idx = 0; | ||
403 | } | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * Initialize the PMCs for all the new and moved counters. | ||
408 | */ | ||
409 | for (i = 0; i < cpuhw->n_counters; ++i) { | ||
410 | counter = cpuhw->counter[i]; | ||
411 | if (counter->hw.idx) | ||
412 | continue; | ||
413 | val = 0; | ||
414 | if (counter->hw_event.irq_period) { | ||
415 | left = atomic64_read(&counter->hw.period_left); | ||
416 | if (left < 0x80000000L) | ||
417 | val = 0x80000000L - left; | ||
418 | } | ||
419 | atomic64_set(&counter->hw.prev_count, val); | ||
420 | counter->hw.idx = hwc_index[i] + 1; | ||
421 | write_pmc(counter->hw.idx, val); | ||
422 | perf_counter_update_userpage(counter); | ||
423 | } | ||
424 | mb(); | ||
425 | cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; | ||
426 | mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); | ||
427 | |||
428 | out: | ||
429 | local_irq_restore(flags); | ||
430 | } | ||
431 | |||
432 | static int collect_events(struct perf_counter *group, int max_count, | ||
433 | struct perf_counter *ctrs[], unsigned int *events) | ||
434 | { | ||
435 | int n = 0; | ||
436 | struct perf_counter *counter; | ||
437 | |||
438 | if (!is_software_counter(group)) { | ||
439 | if (n >= max_count) | ||
440 | return -1; | ||
441 | ctrs[n] = group; | ||
442 | events[n++] = group->hw.config; | ||
443 | } | ||
444 | list_for_each_entry(counter, &group->sibling_list, list_entry) { | ||
445 | if (!is_software_counter(counter) && | ||
446 | counter->state != PERF_COUNTER_STATE_OFF) { | ||
447 | if (n >= max_count) | ||
448 | return -1; | ||
449 | ctrs[n] = counter; | ||
450 | events[n++] = counter->hw.config; | ||
451 | } | ||
452 | } | ||
453 | return n; | ||
454 | } | ||
455 | |||
456 | static void counter_sched_in(struct perf_counter *counter, int cpu) | ||
457 | { | ||
458 | counter->state = PERF_COUNTER_STATE_ACTIVE; | ||
459 | counter->oncpu = cpu; | ||
460 | counter->tstamp_running += counter->ctx->time_now - | ||
461 | counter->tstamp_stopped; | ||
462 | if (is_software_counter(counter)) | ||
463 | counter->hw_ops->enable(counter); | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * Called to enable a whole group of counters. | ||
468 | * Returns 1 if the group was enabled, or -EAGAIN if it could not be. | ||
469 | * Assumes the caller has disabled interrupts and has | ||
470 | * frozen the PMU with hw_perf_save_disable. | ||
471 | */ | ||
472 | int hw_perf_group_sched_in(struct perf_counter *group_leader, | ||
473 | struct perf_cpu_context *cpuctx, | ||
474 | struct perf_counter_context *ctx, int cpu) | ||
475 | { | ||
476 | struct cpu_hw_counters *cpuhw; | ||
477 | long i, n, n0; | ||
478 | struct perf_counter *sub; | ||
479 | |||
480 | cpuhw = &__get_cpu_var(cpu_hw_counters); | ||
481 | n0 = cpuhw->n_counters; | ||
482 | n = collect_events(group_leader, ppmu->n_counter - n0, | ||
483 | &cpuhw->counter[n0], &cpuhw->events[n0]); | ||
484 | if (n < 0) | ||
485 | return -EAGAIN; | ||
486 | if (check_excludes(cpuhw->counter, n0, n)) | ||
487 | return -EAGAIN; | ||
488 | if (power_check_constraints(cpuhw->events, n + n0)) | ||
489 | return -EAGAIN; | ||
490 | cpuhw->n_counters = n0 + n; | ||
491 | cpuhw->n_added += n; | ||
492 | |||
493 | /* | ||
494 | * OK, this group can go on; update counter states etc., | ||
495 | * and enable any software counters | ||
496 | */ | ||
497 | for (i = n0; i < n0 + n; ++i) | ||
498 | cpuhw->counter[i]->hw.config = cpuhw->events[i]; | ||
499 | cpuctx->active_oncpu += n; | ||
500 | n = 1; | ||
501 | counter_sched_in(group_leader, cpu); | ||
502 | list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { | ||
503 | if (sub->state != PERF_COUNTER_STATE_OFF) { | ||
504 | counter_sched_in(sub, cpu); | ||
505 | ++n; | ||
506 | } | ||
507 | } | ||
508 | ctx->nr_active += n; | ||
509 | |||
510 | return 1; | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Add a counter to the PMU. | ||
515 | * If all counters are not already frozen, then we disable and | ||
516 | * re-enable the PMU in order to get hw_perf_restore to do the | ||
517 | * actual work of reconfiguring the PMU. | ||
518 | */ | ||
519 | static int power_perf_enable(struct perf_counter *counter) | ||
520 | { | ||
521 | struct cpu_hw_counters *cpuhw; | ||
522 | unsigned long flags; | ||
523 | u64 pmudis; | ||
524 | int n0; | ||
525 | int ret = -EAGAIN; | ||
526 | |||
527 | local_irq_save(flags); | ||
528 | pmudis = hw_perf_save_disable(); | ||
529 | |||
530 | /* | ||
531 | * Add the counter to the list (if there is room) | ||
532 | * and check whether the total set is still feasible. | ||
533 | */ | ||
534 | cpuhw = &__get_cpu_var(cpu_hw_counters); | ||
535 | n0 = cpuhw->n_counters; | ||
536 | if (n0 >= ppmu->n_counter) | ||
537 | goto out; | ||
538 | cpuhw->counter[n0] = counter; | ||
539 | cpuhw->events[n0] = counter->hw.config; | ||
540 | if (check_excludes(cpuhw->counter, n0, 1)) | ||
541 | goto out; | ||
542 | if (power_check_constraints(cpuhw->events, n0 + 1)) | ||
543 | goto out; | ||
544 | |||
545 | counter->hw.config = cpuhw->events[n0]; | ||
546 | ++cpuhw->n_counters; | ||
547 | ++cpuhw->n_added; | ||
548 | |||
549 | ret = 0; | ||
550 | out: | ||
551 | hw_perf_restore(pmudis); | ||
552 | local_irq_restore(flags); | ||
553 | return ret; | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * Remove a counter from the PMU. | ||
558 | */ | ||
559 | static void power_perf_disable(struct perf_counter *counter) | ||
560 | { | ||
561 | struct cpu_hw_counters *cpuhw; | ||
562 | long i; | ||
563 | u64 pmudis; | ||
564 | unsigned long flags; | ||
565 | |||
566 | local_irq_save(flags); | ||
567 | pmudis = hw_perf_save_disable(); | ||
568 | |||
569 | power_perf_read(counter); | ||
570 | |||
571 | cpuhw = &__get_cpu_var(cpu_hw_counters); | ||
572 | for (i = 0; i < cpuhw->n_counters; ++i) { | ||
573 | if (counter == cpuhw->counter[i]) { | ||
574 | while (++i < cpuhw->n_counters) | ||
575 | cpuhw->counter[i-1] = cpuhw->counter[i]; | ||
576 | --cpuhw->n_counters; | ||
577 | ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); | ||
578 | write_pmc(counter->hw.idx, 0); | ||
579 | counter->hw.idx = 0; | ||
580 | perf_counter_update_userpage(counter); | ||
581 | break; | ||
582 | } | ||
583 | } | ||
584 | if (cpuhw->n_counters == 0) { | ||
585 | /* disable exceptions if no counters are running */ | ||
586 | cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); | ||
587 | } | ||
588 | |||
589 | hw_perf_restore(pmudis); | ||
590 | local_irq_restore(flags); | ||
591 | } | ||
592 | |||
593 | struct hw_perf_counter_ops power_perf_ops = { | ||
594 | .enable = power_perf_enable, | ||
595 | .disable = power_perf_disable, | ||
596 | .read = power_perf_read | ||
597 | }; | ||
598 | |||
599 | /* Number of perf_counters counting hardware events */ | ||
600 | static atomic_t num_counters; | ||
601 | /* Used to avoid races in calling reserve/release_pmc_hardware */ | ||
602 | static DEFINE_MUTEX(pmc_reserve_mutex); | ||
603 | |||
604 | /* | ||
605 | * Release the PMU if this is the last perf_counter. | ||
606 | */ | ||
607 | static void hw_perf_counter_destroy(struct perf_counter *counter) | ||
608 | { | ||
609 | if (!atomic_add_unless(&num_counters, -1, 1)) { | ||
610 | mutex_lock(&pmc_reserve_mutex); | ||
611 | if (atomic_dec_return(&num_counters) == 0) | ||
612 | release_pmc_hardware(); | ||
613 | mutex_unlock(&pmc_reserve_mutex); | ||
614 | } | ||
615 | } | ||
616 | |||
617 | const struct hw_perf_counter_ops * | ||
618 | hw_perf_counter_init(struct perf_counter *counter) | ||
619 | { | ||
620 | unsigned long ev; | ||
621 | struct perf_counter *ctrs[MAX_HWCOUNTERS]; | ||
622 | unsigned int events[MAX_HWCOUNTERS]; | ||
623 | int n; | ||
624 | int err; | ||
625 | |||
626 | if (!ppmu) | ||
627 | return ERR_PTR(-ENXIO); | ||
628 | if ((s64)counter->hw_event.irq_period < 0) | ||
629 | return ERR_PTR(-EINVAL); | ||
630 | if (!perf_event_raw(&counter->hw_event)) { | ||
631 | ev = perf_event_id(&counter->hw_event); | ||
632 | if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) | ||
633 | return ERR_PTR(-EOPNOTSUPP); | ||
634 | ev = ppmu->generic_events[ev]; | ||
635 | } else { | ||
636 | ev = perf_event_config(&counter->hw_event); | ||
637 | } | ||
638 | counter->hw.config_base = ev; | ||
639 | counter->hw.idx = 0; | ||
640 | |||
641 | /* | ||
642 | * If we are not running on a hypervisor, force the | ||
643 | * exclude_hv bit to 0 so that we don't care what | ||
644 | * the user set it to. | ||
645 | */ | ||
646 | if (!firmware_has_feature(FW_FEATURE_LPAR)) | ||
647 | counter->hw_event.exclude_hv = 0; | ||
648 | |||
649 | /* | ||
650 | * If this is in a group, check if it can go on with all the | ||
651 | * other hardware counters in the group. We assume the counter | ||
652 | * hasn't been linked into its leader's sibling list at this point. | ||
653 | */ | ||
654 | n = 0; | ||
655 | if (counter->group_leader != counter) { | ||
656 | n = collect_events(counter->group_leader, ppmu->n_counter - 1, | ||
657 | ctrs, events); | ||
658 | if (n < 0) | ||
659 | return ERR_PTR(-EINVAL); | ||
660 | } | ||
661 | events[n] = ev; | ||
662 | ctrs[n] = counter; | ||
663 | if (check_excludes(ctrs, n, 1)) | ||
664 | return ERR_PTR(-EINVAL); | ||
665 | if (power_check_constraints(events, n + 1)) | ||
666 | return ERR_PTR(-EINVAL); | ||
667 | |||
668 | counter->hw.config = events[n]; | ||
669 | atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); | ||
670 | |||
671 | /* | ||
672 | * See if we need to reserve the PMU. | ||
673 | * If no counters are currently in use, then we have to take a | ||
674 | * mutex to ensure that we don't race with another task doing | ||
675 | * reserve_pmc_hardware or release_pmc_hardware. | ||
676 | */ | ||
677 | err = 0; | ||
678 | if (!atomic_inc_not_zero(&num_counters)) { | ||
679 | mutex_lock(&pmc_reserve_mutex); | ||
680 | if (atomic_read(&num_counters) == 0 && | ||
681 | reserve_pmc_hardware(perf_counter_interrupt)) | ||
682 | err = -EBUSY; | ||
683 | else | ||
684 | atomic_inc(&num_counters); | ||
685 | mutex_unlock(&pmc_reserve_mutex); | ||
686 | } | ||
687 | counter->destroy = hw_perf_counter_destroy; | ||
688 | |||
689 | if (err) | ||
690 | return ERR_PTR(err); | ||
691 | return &power_perf_ops; | ||
692 | } | ||
693 | |||
694 | /* | ||
695 | * A counter has overflowed; update its count and record | ||
696 | * things if requested. Note that interrupts are hard-disabled | ||
697 | * here so there is no possibility of being interrupted. | ||
698 | */ | ||
699 | static void record_and_restart(struct perf_counter *counter, long val, | ||
700 | struct pt_regs *regs) | ||
701 | { | ||
702 | s64 prev, delta, left; | ||
703 | int record = 0; | ||
704 | |||
705 | /* we don't have to worry about interrupts here */ | ||
706 | prev = atomic64_read(&counter->hw.prev_count); | ||
707 | delta = (val - prev) & 0xfffffffful; | ||
708 | atomic64_add(delta, &counter->count); | ||
709 | |||
710 | /* | ||
711 | * See if the total period for this counter has expired, | ||
712 | * and update for the next period. | ||
713 | */ | ||
714 | val = 0; | ||
715 | left = atomic64_read(&counter->hw.period_left) - delta; | ||
716 | if (counter->hw_event.irq_period) { | ||
717 | if (left <= 0) { | ||
718 | left += counter->hw_event.irq_period; | ||
719 | if (left <= 0) | ||
720 | left = counter->hw_event.irq_period; | ||
721 | record = 1; | ||
722 | } | ||
723 | if (left < 0x80000000L) | ||
724 | val = 0x80000000L - left; | ||
725 | } | ||
726 | write_pmc(counter->hw.idx, val); | ||
727 | atomic64_set(&counter->hw.prev_count, val); | ||
728 | atomic64_set(&counter->hw.period_left, left); | ||
729 | perf_counter_update_userpage(counter); | ||
730 | |||
731 | /* | ||
732 | * Finally record data if requested. | ||
733 | */ | ||
734 | if (record) | ||
735 | perf_counter_overflow(counter, 1, regs); | ||
736 | } | ||
737 | |||
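
The PMCs raise a performance monitor exception when bit 31 becomes set, so a counter that should fire after 'left' more events is preloaded with 0x80000000 - left; record_and_restart() above recomputes 'left' and the preload after each overflow, and hw_perf_restore() does the same when a counter is first put on a PMC. A sketch of just that arithmetic (hypothetical helper; in the driver 'left' is already positive at this point):

static unsigned long pmc_preload(long long left)
{
        /* start the PMC so it reaches 0x80000000 after 'left' events;
         * periods of 2^31 or more simply start the PMC at 0 */
        if (left > 0 && left < 0x80000000LL)
                return 0x80000000UL - left;
        return 0;
}
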
738 | /* | ||
739 | * Performance monitor interrupt stuff | ||
740 | */ | ||
741 | static void perf_counter_interrupt(struct pt_regs *regs) | ||
742 | { | ||
743 | int i; | ||
744 | struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); | ||
745 | struct perf_counter *counter; | ||
746 | long val; | ||
747 | int found = 0; | ||
748 | |||
749 | for (i = 0; i < cpuhw->n_counters; ++i) { | ||
750 | counter = cpuhw->counter[i]; | ||
751 | val = read_pmc(counter->hw.idx); | ||
752 | if ((int)val < 0) { | ||
753 | /* counter has overflowed */ | ||
754 | found = 1; | ||
755 | record_and_restart(counter, val, regs); | ||
756 | } | ||
757 | } | ||
758 | |||
759 | /* | ||
760 | * In case we didn't find and reset the counter that caused | ||
761 | * the interrupt, scan all counters and reset any that are | ||
762 | * negative, to avoid getting continual interrupts. | ||
763 | * Any that we processed in the previous loop will not be negative. | ||
764 | */ | ||
765 | if (!found) { | ||
766 | for (i = 0; i < ppmu->n_counter; ++i) { | ||
767 | val = read_pmc(i + 1); | ||
768 | if ((int)val < 0) | ||
769 | write_pmc(i + 1, 0); | ||
770 | } | ||
771 | } | ||
772 | |||
773 | /* | ||
774 | * Reset MMCR0 to its normal value. This will set PMXE and | ||
775 | * clear FC (freeze counters) and PMAO (perf mon alert occurred) | ||
776 | * and thus allow interrupts to occur again. | ||
777 | * XXX might want to use MSR.PM to keep the counters frozen until | ||
778 | * we get back out of this interrupt. | ||
779 | */ | ||
780 | mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); | ||
781 | |||
782 | /* | ||
783 | * If we need a wakeup, check whether interrupts were soft-enabled | ||
784 | * when we took the interrupt. If they were, we can wake stuff up | ||
786 | * immediately; otherwise we'll have to do the wakeup when interrupts | ||
786 | * get soft-enabled. | ||
787 | */ | ||
788 | if (test_perf_counter_pending() && regs->softe) { | ||
789 | irq_enter(); | ||
790 | clear_perf_counter_pending(); | ||
791 | perf_counter_do_pending(); | ||
792 | irq_exit(); | ||
793 | } | ||
794 | } | ||
795 | |||
796 | void hw_perf_counter_setup(int cpu) | ||
797 | { | ||
798 | struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); | ||
799 | |||
800 | memset(cpuhw, 0, sizeof(*cpuhw)); | ||
801 | cpuhw->mmcr[0] = MMCR0_FC; | ||
802 | } | ||
803 | |||
804 | extern struct power_pmu power4_pmu; | ||
805 | extern struct power_pmu ppc970_pmu; | ||
806 | extern struct power_pmu power5_pmu; | ||
807 | extern struct power_pmu power5p_pmu; | ||
808 | extern struct power_pmu power6_pmu; | ||
809 | |||
810 | static int init_perf_counters(void) | ||
811 | { | ||
812 | unsigned long pvr; | ||
813 | |||
814 | /* XXX should get this from cputable */ | ||
815 | pvr = mfspr(SPRN_PVR); | ||
816 | switch (PVR_VER(pvr)) { | ||
817 | case PV_POWER4: | ||
818 | case PV_POWER4p: | ||
819 | ppmu = &power4_pmu; | ||
820 | break; | ||
821 | case PV_970: | ||
822 | case PV_970FX: | ||
823 | case PV_970MP: | ||
824 | ppmu = &ppc970_pmu; | ||
825 | break; | ||
826 | case PV_POWER5: | ||
827 | ppmu = &power5_pmu; | ||
828 | break; | ||
829 | case PV_POWER5p: | ||
830 | ppmu = &power5p_pmu; | ||
831 | break; | ||
832 | case 0x3e: | ||
833 | ppmu = &power6_pmu; | ||
834 | break; | ||
835 | } | ||
836 | |||
837 | /* | ||
838 | * Use FCHV to ignore kernel events if MSR.HV is set. | ||
839 | */ | ||
840 | if (mfmsr() & MSR_HV) | ||
841 | freeze_counters_kernel = MMCR0_FCHV; | ||
842 | |||
843 | return 0; | ||
844 | } | ||
845 | |||
846 | arch_initcall(init_perf_counters); | ||
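
init_perf_counters() above points ppmu at one of the per-CPU-family struct power_pmu descriptors based on the PVR; each of the power4/970/power5/power5+/power6 files fills in the callbacks declared in asm/perf_counter.h. A hypothetical example showing the shape of such a descriptor (all numbers, event codes and callback bodies are made up for illustration, not taken from any real table):

static int fake_compute_mmcr(unsigned int ev[], int n_ev,
                             unsigned int hwc[], u64 mmcr[]) { return 0; }
static int fake_get_constraint(unsigned int ev, u64 *mskp, u64 *valp)
                             { *mskp = 0; *valp = 0; return 0; }
static int fake_get_alternatives(unsigned int ev, unsigned int alt[])
                             { alt[0] = ev; return 1; }
static void fake_disable_pmc(unsigned int pmc, u64 mmcr[]) { }

static int fake_generic_events[] = {
        0x1001,         /* index 0: cycles       (made-up code) */
        0x4001,         /* index 1: instructions (made-up code) */
};

struct power_pmu fake_pmu = {
        .n_counter        = 6,
        .max_alternatives = 4,
        .compute_mmcr     = fake_compute_mmcr,
        .get_constraint   = fake_get_constraint,
        .get_alternatives = fake_get_alternatives,
        .disable_pmc      = fake_disable_pmc,
        .n_generic        = 2,
        .generic_events   = fake_generic_events,
};
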
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c new file mode 100644 index 000000000000..1407b19ab619 --- /dev/null +++ b/arch/powerpc/kernel/power4-pmu.c | |||
@@ -0,0 +1,557 @@ | |||
1 | /* | ||
2 | * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors. | ||
3 | * | ||
4 | * Copyright 2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/perf_counter.h> | ||
13 | #include <asm/reg.h> | ||
14 | |||
15 | /* | ||
16 | * Bits in event code for POWER4 | ||
17 | */ | ||
18 | #define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ | ||
19 | #define PM_PMC_MSK 0xf | ||
20 | #define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ | ||
21 | #define PM_UNIT_MSK 0xf | ||
22 | #define PM_LOWER_SH 6 | ||
23 | #define PM_LOWER_MSK 1 | ||
24 | #define PM_LOWER_MSKS 0x40 | ||
25 | #define PM_BYTE_SH 4 /* Byte number of event bus to use */ | ||
26 | #define PM_BYTE_MSK 3 | ||
27 | #define PM_PMCSEL_MSK 7 | ||
28 | |||
29 | /* | ||
30 | * Unit code values | ||
31 | */ | ||
32 | #define PM_FPU 1 | ||
33 | #define PM_ISU1 2 | ||
34 | #define PM_IFU 3 | ||
35 | #define PM_IDU0 4 | ||
36 | #define PM_ISU1_ALT 6 | ||
37 | #define PM_ISU2 7 | ||
38 | #define PM_IFU_ALT 8 | ||
39 | #define PM_LSU0 9 | ||
40 | #define PM_LSU1 0xc | ||
41 | #define PM_GPS 0xf | ||
42 | |||
43 | /* | ||
44 | * Bits in MMCR0 for POWER4 | ||
45 | */ | ||
46 | #define MMCR0_PMC1SEL_SH 8 | ||
47 | #define MMCR0_PMC2SEL_SH 1 | ||
48 | #define MMCR_PMCSEL_MSK 0x1f | ||
49 | |||
50 | /* | ||
51 | * Bits in MMCR1 for POWER4 | ||
52 | */ | ||
53 | #define MMCR1_TTM0SEL_SH 62 | ||
54 | #define MMCR1_TTC0SEL_SH 61 | ||
55 | #define MMCR1_TTM1SEL_SH 59 | ||
56 | #define MMCR1_TTC1SEL_SH 58 | ||
57 | #define MMCR1_TTM2SEL_SH 56 | ||
58 | #define MMCR1_TTC2SEL_SH 55 | ||
59 | #define MMCR1_TTM3SEL_SH 53 | ||
60 | #define MMCR1_TTC3SEL_SH 52 | ||
61 | #define MMCR1_TTMSEL_MSK 3 | ||
62 | #define MMCR1_TD_CP_DBG0SEL_SH 50 | ||
63 | #define MMCR1_TD_CP_DBG1SEL_SH 48 | ||
64 | #define MMCR1_TD_CP_DBG2SEL_SH 46 | ||
65 | #define MMCR1_TD_CP_DBG3SEL_SH 44 | ||
66 | #define MMCR1_DEBUG0SEL_SH 43 | ||
67 | #define MMCR1_DEBUG1SEL_SH 42 | ||
68 | #define MMCR1_DEBUG2SEL_SH 41 | ||
69 | #define MMCR1_DEBUG3SEL_SH 40 | ||
70 | #define MMCR1_PMC1_ADDER_SEL_SH 39 | ||
71 | #define MMCR1_PMC2_ADDER_SEL_SH 38 | ||
72 | #define MMCR1_PMC6_ADDER_SEL_SH 37 | ||
73 | #define MMCR1_PMC5_ADDER_SEL_SH 36 | ||
74 | #define MMCR1_PMC8_ADDER_SEL_SH 35 | ||
75 | #define MMCR1_PMC7_ADDER_SEL_SH 34 | ||
76 | #define MMCR1_PMC3_ADDER_SEL_SH 33 | ||
77 | #define MMCR1_PMC4_ADDER_SEL_SH 32 | ||
78 | #define MMCR1_PMC3SEL_SH 27 | ||
79 | #define MMCR1_PMC4SEL_SH 22 | ||
80 | #define MMCR1_PMC5SEL_SH 17 | ||
81 | #define MMCR1_PMC6SEL_SH 12 | ||
82 | #define MMCR1_PMC7SEL_SH 7 | ||
83 | #define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */ | ||
84 | |||
85 | static short mmcr1_adder_bits[8] = { | ||
86 | MMCR1_PMC1_ADDER_SEL_SH, | ||
87 | MMCR1_PMC2_ADDER_SEL_SH, | ||
88 | MMCR1_PMC3_ADDER_SEL_SH, | ||
89 | MMCR1_PMC4_ADDER_SEL_SH, | ||
90 | MMCR1_PMC5_ADDER_SEL_SH, | ||
91 | MMCR1_PMC6_ADDER_SEL_SH, | ||
92 | MMCR1_PMC7_ADDER_SEL_SH, | ||
93 | MMCR1_PMC8_ADDER_SEL_SH | ||
94 | }; | ||
95 | |||
96 | /* | ||
97 | * Bits in MMCRA | ||
98 | */ | ||
99 | #define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */ | ||
100 | |||
101 | /* | ||
102 | * Layout of constraint bits: | ||
103 | * 6666555555555544444444443333333333222222222211111111110000000000 | ||
104 | * 3210987654321098765432109876543210987654321098765432109876543210 | ||
105 | * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><> | ||
106 | * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 | ||
107 | * \SMPL ||\TTC3SEL | ||
108 | * |\TTC_IFU_SEL | ||
109 | * \TTM2SEL0 | ||
110 | * | ||
111 | * SMPL - SAMPLE_ENABLE constraint | ||
112 | * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000 | ||
113 | * | ||
114 | * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2 | ||
115 | * 55: UC1 error 0x0080_0000_0000_0000 | ||
116 | * 54: FPU events needed 0x0040_0000_0000_0000 | ||
117 | * 53: ISU1 events needed 0x0020_0000_0000_0000 | ||
118 | * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000 | ||
119 | * | ||
120 | * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0 | ||
121 | * 51: UC2 error 0x0008_0000_0000_0000 | ||
122 | * 50: FPU events needed 0x0004_0000_0000_0000 | ||
123 | * 49: IFU events needed 0x0002_0000_0000_0000 | ||
124 | * 48: LSU0 events needed 0x0001_0000_0000_0000 | ||
125 | * | ||
126 | * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1 | ||
127 | * 47: UC3 error 0x8000_0000_0000 | ||
128 | * 46: LSU0 events needed 0x4000_0000_0000 | ||
129 | * 45: IFU events needed 0x2000_0000_0000 | ||
130 | * 44: IDU0|ISU2 events needed 0x1000_0000_0000 | ||
131 | * 43: ISU1 events needed 0x0800_0000_0000 | ||
132 | * | ||
133 | * TTM2SEL0 | ||
134 | * 42: 0 = IDU0 events needed | ||
135 | * 1 = ISU2 events needed 0x0400_0000_0000 | ||
136 | * | ||
137 | * TTC_IFU_SEL | ||
138 | * 41: 0 = IFU.U events needed | ||
139 | * 1 = IFU.L events needed 0x0200_0000_0000 | ||
140 | * | ||
141 | * TTC3SEL | ||
142 | * 40: 0 = LSU1.U events needed | ||
143 | * 1 = LSU1.L events needed 0x0100_0000_0000 | ||
144 | * | ||
145 | * PS1 | ||
146 | * 39: PS1 error 0x0080_0000_0000 | ||
147 | * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 | ||
148 | * | ||
149 | * PS2 | ||
150 | * 35: PS2 error 0x0008_0000_0000 | ||
151 | * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 | ||
152 | * | ||
153 | * B0 | ||
154 | * 28-31: Byte 0 event source 0xf000_0000 | ||
155 | * 1 = FPU | ||
156 | * 2 = ISU1 | ||
157 | * 3 = IFU | ||
158 | * 4 = IDU0 | ||
159 | * 7 = ISU2 | ||
160 | * 9 = LSU0 | ||
161 | * c = LSU1 | ||
162 | * f = GPS | ||
163 | * | ||
164 | * B1, B2, B3 | ||
165 | * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources | ||
166 | * | ||
167 | * P8 | ||
168 | * 15: P8 error 0x8000 | ||
169 | * 14-15: Count of events needing PMC8 | ||
170 | * | ||
171 | * P1..P7 | ||
172 | * 0-13: Count of events needing PMC1..PMC7 | ||
173 | * | ||
174 | * Note: this doesn't allow events using IFU.U to be combined with events | ||
175 | * using IFU.L, though that is feasible (using TTM0 and TTM2). However | ||
176 | * there are no listed events for IFU.L (they are debug events not | ||
177 | * verified for performance monitoring) so this shouldn't cause a | ||
178 | * problem. | ||
179 | */ | ||
180 | |||
181 | static struct unitinfo { | ||
182 | u64 value, mask; | ||
183 | int unit; | ||
184 | int lowerbit; | ||
185 | } p4_unitinfo[16] = { | ||
186 | [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 }, | ||
187 | [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, | ||
188 | [PM_ISU1_ALT] = | ||
189 | { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, | ||
190 | [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, | ||
191 | [PM_IFU_ALT] = | ||
192 | { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, | ||
193 | [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 }, | ||
194 | [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 }, | ||
195 | [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 }, | ||
196 | [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 }, | ||
197 | [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 } | ||
198 | }; | ||
199 | |||
200 | static unsigned char direct_marked_event[8] = { | ||
201 | (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ | ||
202 | (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ | ||
203 | (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */ | ||
204 | (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ | ||
205 | (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */ | ||
206 | (1<<3) | (1<<4) | (1<<5), | ||
207 | /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ | ||
208 | (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ | ||
209 | (1<<4), /* PMC8: PM_MRK_LSU_FIN */ | ||
210 | }; | ||
211 | |||
212 | /* | ||
213 | * Returns 1 if event counts things relating to marked instructions | ||
214 | * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. | ||
215 | */ | ||
216 | static int p4_marked_instr_event(unsigned int event) | ||
217 | { | ||
218 | int pmc, psel, unit, byte, bit; | ||
219 | unsigned int mask; | ||
220 | |||
221 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
222 | psel = event & PM_PMCSEL_MSK; | ||
223 | if (pmc) { | ||
224 | if (direct_marked_event[pmc - 1] & (1 << psel)) | ||
225 | return 1; | ||
226 | if (psel == 0) /* add events */ | ||
227 | bit = (pmc <= 4)? pmc - 1: 8 - pmc; | ||
228 | else if (psel == 6) /* decode events */ | ||
229 | bit = 4; | ||
230 | else | ||
231 | return 0; | ||
232 | } else | ||
233 | bit = psel; | ||
234 | |||
235 | byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
236 | unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
237 | mask = 0; | ||
238 | switch (unit) { | ||
239 | case PM_LSU1: | ||
240 | if (event & PM_LOWER_MSKS) | ||
241 | mask = 1 << 28; /* byte 7 bit 4 */ | ||
242 | else | ||
243 | mask = 6 << 24; /* byte 3 bits 1 and 2 */ | ||
244 | break; | ||
245 | case PM_LSU0: | ||
246 | /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */ | ||
247 | mask = 0x083dff00; | ||
248 | } | ||
249 | return (mask >> (byte * 8 + bit)) & 1; | ||
250 | } | ||
251 | |||
252 | static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp) | ||
253 | { | ||
254 | int pmc, byte, unit, lower, sh; | ||
255 | u64 mask = 0, value = 0; | ||
256 | int grp = -1; | ||
257 | |||
258 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
259 | if (pmc) { | ||
260 | if (pmc > 8) | ||
261 | return -1; | ||
262 | sh = (pmc - 1) * 2; | ||
263 | mask |= 2 << sh; | ||
264 | value |= 1 << sh; | ||
265 | grp = ((pmc - 1) >> 1) & 1; | ||
266 | } | ||
267 | unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
268 | byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
269 | if (unit) { | ||
270 | lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK; | ||
271 | |||
272 | /* | ||
273 | * Bus events on bytes 0 and 2 can be counted | ||
274 | * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. | ||
275 | */ | ||
276 | if (!pmc) | ||
277 | grp = byte & 1; | ||
278 | |||
279 | if (!p4_unitinfo[unit].unit) | ||
280 | return -1; | ||
281 | mask |= p4_unitinfo[unit].mask; | ||
282 | value |= p4_unitinfo[unit].value; | ||
283 | sh = p4_unitinfo[unit].lowerbit; | ||
284 | if (sh > 1) | ||
285 | value |= (u64)lower << sh; | ||
286 | else if (lower != sh) | ||
287 | return -1; | ||
288 | unit = p4_unitinfo[unit].unit; | ||
289 | |||
290 | /* Set byte lane select field */ | ||
291 | mask |= 0xfULL << (28 - 4 * byte); | ||
292 | value |= (u64)unit << (28 - 4 * byte); | ||
293 | } | ||
294 | if (grp == 0) { | ||
295 | /* increment PMC1/2/5/6 field */ | ||
296 | mask |= 0x8000000000ull; | ||
297 | value |= 0x1000000000ull; | ||
298 | } else { | ||
299 | /* increment PMC3/4/7/8 field */ | ||
300 | mask |= 0x800000000ull; | ||
301 | value |= 0x100000000ull; | ||
302 | } | ||
303 | |||
304 | /* Marked instruction events need sample_enable set */ | ||
305 | if (p4_marked_instr_event(event)) { | ||
306 | mask |= 1ull << 56; | ||
307 | value |= 1ull << 56; | ||
308 | } | ||
309 | |||
310 | /* PMCSEL=6 decode events on byte 2 need sample_enable clear */ | ||
311 | if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2) | ||
312 | mask |= 1ull << 56; | ||
313 | |||
314 | *maskp = mask; | ||
315 | *valp = value; | ||
316 | return 0; | ||
317 | } | ||
318 | |||
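
A worked example of what p4_get_constraint() produces, using a made-up event code 0x1101 (PMC1, unit = PM_FPU, byte 0, PMCSEL = 1; not claimed to be a real POWER4 event):

u64 msk, val;

p4_get_constraint(0x1101, &msk, &val);
/*
 * msk == 0x00880080f0000002, val == 0x0044001010000001, made up of:
 *   0x2 / 0x1                            PMC1 in use (P1 count field)
 *   0x88000000000000 / 0x44000000000000  FPU events needed (UC1 and UC2)
 *   0xf0000000 / 0x10000000              byte 0 bus lane must carry FPU
 *   0x8000000000 / 0x1000000000          one event in the PMC1/2/5/6 group (PS1)
 */
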
319 | static unsigned int ppc_inst_cmpl[] = { | ||
320 | 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 | ||
321 | }; | ||
322 | |||
323 | static int p4_get_alternatives(unsigned int event, unsigned int alt[]) | ||
324 | { | ||
325 | int i, j, na; | ||
326 | |||
327 | alt[0] = event; | ||
328 | na = 1; | ||
329 | |||
330 | /* 2 possibilities for PM_GRP_DISP_REJECT */ | ||
331 | if (event == 0x8003 || event == 0x0224) { | ||
332 | alt[1] = event ^ (0x8003 ^ 0x0224); | ||
333 | return 2; | ||
334 | } | ||
335 | |||
336 | /* 2 possibilities for PM_ST_MISS_L1 */ | ||
337 | if (event == 0x0c13 || event == 0x0c23) { | ||
338 | alt[1] = event ^ (0x0c13 ^ 0x0c23); | ||
339 | return 2; | ||
340 | } | ||
341 | |||
342 | /* several possibilities for PM_INST_CMPL */ | ||
343 | for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) { | ||
344 | if (event == ppc_inst_cmpl[i]) { | ||
345 | for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j) | ||
346 | if (j != i) | ||
347 | alt[na++] = ppc_inst_cmpl[j]; | ||
348 | break; | ||
349 | } | ||
350 | } | ||
351 | |||
352 | return na; | ||
353 | } | ||
354 | |||
355 | static int p4_compute_mmcr(unsigned int event[], int n_ev, | ||
356 | unsigned int hwc[], u64 mmcr[]) | ||
357 | { | ||
358 | u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; | ||
359 | unsigned int pmc, unit, byte, psel, lower; | ||
360 | unsigned int ttm, grp; | ||
361 | unsigned int pmc_inuse = 0; | ||
362 | unsigned int pmc_grp_use[2]; | ||
363 | unsigned char busbyte[4]; | ||
364 | unsigned char unituse[16]; | ||
365 | unsigned int unitlower = 0; | ||
366 | int i; | ||
367 | |||
368 | if (n_ev > 8) | ||
369 | return -1; | ||
370 | |||
371 | /* First pass to count resource use */ | ||
372 | pmc_grp_use[0] = pmc_grp_use[1] = 0; | ||
373 | memset(busbyte, 0, sizeof(busbyte)); | ||
374 | memset(unituse, 0, sizeof(unituse)); | ||
375 | for (i = 0; i < n_ev; ++i) { | ||
376 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
377 | if (pmc) { | ||
378 | if (pmc_inuse & (1 << (pmc - 1))) | ||
379 | return -1; | ||
380 | pmc_inuse |= 1 << (pmc - 1); | ||
381 | /* count 1/2/5/6 vs 3/4/7/8 use */ | ||
382 | ++pmc_grp_use[((pmc - 1) >> 1) & 1]; | ||
383 | } | ||
384 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
385 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
386 | lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK; | ||
387 | if (unit) { | ||
388 | if (!pmc) | ||
389 | ++pmc_grp_use[byte & 1]; | ||
390 | if (unit == 6 || unit == 8) | ||
391 | /* map alt ISU1/IFU codes: 6->2, 8->3 */ | ||
392 | unit = (unit >> 1) - 1; | ||
393 | if (busbyte[byte] && busbyte[byte] != unit) | ||
394 | return -1; | ||
395 | busbyte[byte] = unit; | ||
396 | lower <<= unit; | ||
397 | if (unituse[unit] && lower != (unitlower & lower)) | ||
398 | return -1; | ||
399 | unituse[unit] = 1; | ||
400 | unitlower |= lower; | ||
401 | } | ||
402 | } | ||
403 | if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) | ||
404 | return -1; | ||
405 | |||
406 | /* | ||
407 | * Assign resources and set multiplexer selects. | ||
408 | * | ||
409 | * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2. | ||
410 | * Each TTMx can only select one unit, but since | ||
411 | * units 2 and 6 are both ISU1, and 3 and 8 are both IFU, | ||
412 | * we have some choices. | ||
413 | */ | ||
414 | if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) { | ||
415 | unituse[6] = 1; /* Move 2 to 6 */ | ||
416 | unituse[2] = 0; | ||
417 | } | ||
418 | if (unituse[3] & (unituse[1] | unituse[2])) { | ||
419 | unituse[8] = 1; /* Move 3 to 8 */ | ||
420 | unituse[3] = 0; | ||
421 | unitlower = (unitlower & ~8) | ((unitlower & 8) << 5); | ||
422 | } | ||
423 | /* Check only one unit per TTMx */ | ||
424 | if (unituse[1] + unituse[2] + unituse[3] > 1 || | ||
425 | unituse[4] + unituse[6] + unituse[7] > 1 || | ||
426 | unituse[8] + unituse[9] > 1 || | ||
427 | (unituse[5] | unituse[10] | unituse[11] | | ||
428 | unituse[13] | unituse[14])) | ||
429 | return -1; | ||
430 | |||
431 | /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */ | ||
432 | mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH; | ||
433 | mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH; | ||
434 | mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH; | ||
435 | |||
436 | /* Set TTCxSEL fields. */ | ||
437 | if (unitlower & 0xe) | ||
438 | mmcr1 |= 1ull << MMCR1_TTC0SEL_SH; | ||
439 | if (unitlower & 0xf0) | ||
440 | mmcr1 |= 1ull << MMCR1_TTC1SEL_SH; | ||
441 | if (unitlower & 0xf00) | ||
442 | mmcr1 |= 1ull << MMCR1_TTC2SEL_SH; | ||
443 | if (unitlower & 0x7000) | ||
444 | mmcr1 |= 1ull << MMCR1_TTC3SEL_SH; | ||
445 | |||
446 | /* Set byte lane select fields. */ | ||
447 | for (byte = 0; byte < 4; ++byte) { | ||
448 | unit = busbyte[byte]; | ||
449 | if (!unit) | ||
450 | continue; | ||
451 | if (unit == 0xf) { | ||
452 | /* special case for GPS */ | ||
453 | mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte); | ||
454 | } else { | ||
455 | if (!unituse[unit]) | ||
456 | ttm = unit - 1; /* 2->1, 3->2 */ | ||
457 | else | ||
458 | ttm = unit >> 2; | ||
459 | mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte); | ||
460 | } | ||
461 | } | ||
462 | |||
463 | /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ | ||
464 | for (i = 0; i < n_ev; ++i) { | ||
465 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
466 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
467 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
468 | psel = event[i] & PM_PMCSEL_MSK; | ||
469 | if (!pmc) { | ||
470 | /* Bus event or 00xxx direct event (off or cycles) */ | ||
471 | if (unit) | ||
472 | psel |= 0x10 | ((byte & 2) << 2); | ||
473 | for (pmc = 0; pmc < 8; ++pmc) { | ||
474 | if (pmc_inuse & (1 << pmc)) | ||
475 | continue; | ||
476 | grp = (pmc >> 1) & 1; | ||
477 | if (unit) { | ||
478 | if (grp == (byte & 1)) | ||
479 | break; | ||
480 | } else if (pmc_grp_use[grp] < 4) { | ||
481 | ++pmc_grp_use[grp]; | ||
482 | break; | ||
483 | } | ||
484 | } | ||
485 | pmc_inuse |= 1 << pmc; | ||
486 | } else { | ||
487 | /* Direct event */ | ||
488 | --pmc; | ||
489 | if (psel == 0 && (byte & 2)) | ||
490 | /* add events on higher-numbered bus */ | ||
491 | mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; | ||
492 | else if (psel == 6 && byte == 3) | ||
493 | /* seem to need to set sample_enable here */ | ||
494 | mmcra |= MMCRA_SAMPLE_ENABLE; | ||
495 | psel |= 8; | ||
496 | } | ||
497 | if (pmc <= 1) | ||
498 | mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc); | ||
499 | else | ||
500 | mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); | ||
501 | if (pmc == 7) /* PMC8 */ | ||
502 | mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH; | ||
503 | hwc[i] = pmc; | ||
504 | if (p4_marked_instr_event(event[i])) | ||
505 | mmcra |= MMCRA_SAMPLE_ENABLE; | ||
506 | } | ||
507 | |||
508 | if (pmc_inuse & 1) | ||
509 | mmcr0 |= MMCR0_PMC1CE; | ||
510 | if (pmc_inuse & 0xfe) | ||
511 | mmcr0 |= MMCR0_PMCjCE; | ||
512 | |||
513 | mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ | ||
514 | |||
515 | /* Return MMCRx values */ | ||
516 | mmcr[0] = mmcr0; | ||
517 | mmcr[1] = mmcr1; | ||
518 | mmcr[2] = mmcra; | ||
519 | return 0; | ||
520 | } | ||
521 | |||
522 | static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) | ||
523 | { | ||
524 | /* | ||
525 | * Setting the PMCxSEL field to 0 disables PMC x. | ||
526 | * (Note that pmc is 0-based here, not 1-based.) | ||
527 | */ | ||
528 | if (pmc <= 1) { | ||
529 | mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc)); | ||
530 | } else { | ||
531 | mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2))); | ||
532 | if (pmc == 7) | ||
533 | mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH); | ||
534 | } | ||
535 | } | ||
536 | |||
537 | static int p4_generic_events[] = { | ||
538 | [PERF_COUNT_CPU_CYCLES] = 7, | ||
539 | [PERF_COUNT_INSTRUCTIONS] = 0x1001, | ||
540 | [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ | ||
541 | [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ | ||
542 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ | ||
543 | [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ | ||
544 | }; | ||
545 | |||
546 | struct power_pmu power4_pmu = { | ||
547 | .n_counter = 8, | ||
548 | .max_alternatives = 5, | ||
549 | .add_fields = 0x0000001100005555ull, | ||
550 | .test_adder = 0x0011083300000000ull, | ||
551 | .compute_mmcr = p4_compute_mmcr, | ||
552 | .get_constraint = p4_get_constraint, | ||
553 | .get_alternatives = p4_get_alternatives, | ||
554 | .disable_pmc = p4_disable_pmc, | ||
555 | .n_generic = ARRAY_SIZE(p4_generic_events), | ||
556 | .generic_events = p4_generic_events, | ||
557 | }; | ||
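The (mask, value) pairs produced by p4_get_constraint() are only half of the scheme; generic code elsewhere in this patch series combines them across all requested events to decide whether the set fits on the PMU. The sketch below is an illustration of how such pairs can be combined under the add_fields/test_adder convention used by the power_pmu structures here (add_fields marks the low bit of each counter field, test_adder biases a field so that exceeding its capacity carries into the error bit); it is a rough sketch, not the exact kernel routine.

#include <linux/types.h>	/* u64 */

/*
 * Sketch only: fold per-event constraint (mask, value) pairs together.
 * A disagreement in a select field, or an over-full counter field,
 * shows up as a difference under one of the accumulated masks.
 */
static int constraints_ok(u64 addf, u64 tadd, int n,
			  const u64 masks[], const u64 values[])
{
	u64 mask = 0, value = 0, nv;
	int i;

	for (i = 0; i < n; ++i) {
		/* OR in select fields; add counter fields (addf supplies the carry) */
		nv = (value | values[i]) + (value & values[i] & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ values[i]) & masks[i]) != 0)
			return 0;	/* this combination of events conflicts */
		mask |= masks[i];
		value = nv;
	}
	return 1;
}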
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c new file mode 100644 index 000000000000..cec21ea65b0e --- /dev/null +++ b/arch/powerpc/kernel/power5+-pmu.c | |||
@@ -0,0 +1,452 @@ | |||
1 | /* | ||
2 | * Performance counter support for POWER5+/++ (not POWER5) processors. | ||
3 | * | ||
4 | * Copyright 2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/perf_counter.h> | ||
13 | #include <asm/reg.h> | ||
14 | |||
15 | /* | ||
16 | * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) | ||
17 | */ | ||
18 | #define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ | ||
19 | #define PM_PMC_MSK 0xf | ||
20 | #define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) | ||
21 | #define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ | ||
22 | #define PM_UNIT_MSK 0xf | ||
23 | #define PM_BYTE_SH 12 /* Byte number of event bus to use */ | ||
24 | #define PM_BYTE_MSK 7 | ||
25 | #define PM_GRS_SH 8 /* Storage subsystem mux select */ | ||
26 | #define PM_GRS_MSK 7 | ||
27 | #define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ | ||
28 | #define PM_PMCSEL_MSK 0x7f | ||
29 | |||
30 | /* Values in PM_UNIT field */ | ||
31 | #define PM_FPU 0 | ||
32 | #define PM_ISU0 1 | ||
33 | #define PM_IFU 2 | ||
34 | #define PM_ISU1 3 | ||
35 | #define PM_IDU 4 | ||
36 | #define PM_ISU0_ALT 6 | ||
37 | #define PM_GRS 7 | ||
38 | #define PM_LSU0 8 | ||
39 | #define PM_LSU1 0xc | ||
40 | #define PM_LASTUNIT 0xc | ||
41 | |||
42 | /* | ||
43 | * Bits in MMCR1 for POWER5+ | ||
44 | */ | ||
45 | #define MMCR1_TTM0SEL_SH 62 | ||
46 | #define MMCR1_TTM1SEL_SH 60 | ||
47 | #define MMCR1_TTM2SEL_SH 58 | ||
48 | #define MMCR1_TTM3SEL_SH 56 | ||
49 | #define MMCR1_TTMSEL_MSK 3 | ||
50 | #define MMCR1_TD_CP_DBG0SEL_SH 54 | ||
51 | #define MMCR1_TD_CP_DBG1SEL_SH 52 | ||
52 | #define MMCR1_TD_CP_DBG2SEL_SH 50 | ||
53 | #define MMCR1_TD_CP_DBG3SEL_SH 48 | ||
54 | #define MMCR1_GRS_L2SEL_SH 46 | ||
55 | #define MMCR1_GRS_L2SEL_MSK 3 | ||
56 | #define MMCR1_GRS_L3SEL_SH 44 | ||
57 | #define MMCR1_GRS_L3SEL_MSK 3 | ||
58 | #define MMCR1_GRS_MCSEL_SH 41 | ||
59 | #define MMCR1_GRS_MCSEL_MSK 7 | ||
60 | #define MMCR1_GRS_FABSEL_SH 39 | ||
61 | #define MMCR1_GRS_FABSEL_MSK 3 | ||
62 | #define MMCR1_PMC1_ADDER_SEL_SH 35 | ||
63 | #define MMCR1_PMC2_ADDER_SEL_SH 34 | ||
64 | #define MMCR1_PMC3_ADDER_SEL_SH 33 | ||
65 | #define MMCR1_PMC4_ADDER_SEL_SH 32 | ||
66 | #define MMCR1_PMC1SEL_SH 25 | ||
67 | #define MMCR1_PMC2SEL_SH 17 | ||
68 | #define MMCR1_PMC3SEL_SH 9 | ||
69 | #define MMCR1_PMC4SEL_SH 1 | ||
70 | #define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) | ||
71 | #define MMCR1_PMCSEL_MSK 0x7f | ||
72 | |||
73 | /* | ||
74 | * Bits in MMCRA | ||
75 | */ | ||
76 | |||
77 | /* | ||
78 | * Layout of constraint bits: | ||
79 | * 6666555555555544444444443333333333222222222211111111110000000000 | ||
80 | * 3210987654321098765432109876543210987654321098765432109876543210 | ||
81 | * [ ><><>< ><> <><>[ > < >< >< >< ><><><><> | ||
82 | * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1 | ||
83 | * | ||
84 | * NC - number of counters | ||
85 | * 51: NC error 0x0008_0000_0000_0000 | ||
86 | * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 | ||
87 | * | ||
88 | * G0..G3 - GRS mux constraints | ||
89 | * 46-47: GRS_L2SEL value | ||
90 | * 44-45: GRS_L3SEL value | ||
91 | * 41-43: GRS_MCSEL value | ||
92 | * 39-40: GRS_FABSEL value | ||
93 | * Note that these match up with their bit positions in MMCR1 | ||
94 | * | ||
95 | * T0 - TTM0 constraint | ||
96 | * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000 | ||
97 | * | ||
98 | * T1 - TTM1 constraint | ||
99 | * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000 | ||
100 | * | ||
101 | * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS | ||
102 | * 33: UC3 error 0x02_0000_0000 | ||
103 | * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000 | ||
104 | * 31: ISU0 events needed 0x00_8000_0000 | ||
105 | * 30: IDU|GRS events needed 0x00_4000_0000 | ||
106 | * | ||
107 | * B0 | ||
108 | * 20-23: Byte 0 event source 0x00f0_0000 | ||
109 | * Encoding as for the event code | ||
110 | * | ||
111 | * B1, B2, B3 | ||
112 | * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources | ||
113 | * | ||
114 | * P4 | ||
115 | * 7: P4 error 0x80 | ||
116 | * 6-7: Count of events needing PMC4 | ||
117 | * | ||
118 | * P1..P3 | ||
119 | * 0-5: Count of events needing PMC1..PMC3 | ||
120 | */ | ||
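/*
 * Worked example (added for illustration, not part of the original
 * comment): the POWER5+ code 0x230e4 (BR_ISSUED, used for the generic
 * branch event below) has pmc = 0, unit = 2 (PM_IFU), byte = 3 and
 * PM_BUSEVENT_MSK set.  power5p_get_constraint() turns it into
 *   mask  = 0x0008003200000f00, value = 0x0001002100000200
 * i.e. bit 48 set (one event needing PMC1-4), TTM0SEL bits 36-37 = 2
 * (IFU), bit 32 set (FPU|IFU|ISU1 needed), and byte-3 source bits
 * 8-11 = 2 (IFU), matching the layout described above.
 */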
121 | |||
122 | static const int grsel_shift[8] = { | ||
123 | MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, | ||
124 | MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, | ||
125 | MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH | ||
126 | }; | ||
127 | |||
128 | /* Masks and values for using events from the various units */ | ||
129 | static u64 unit_cons[PM_LASTUNIT+1][2] = { | ||
130 | [PM_FPU] = { 0x3200000000ull, 0x0100000000ull }, | ||
131 | [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull }, | ||
132 | [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull }, | ||
133 | [PM_IFU] = { 0x3200000000ull, 0x2100000000ull }, | ||
134 | [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull }, | ||
135 | [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull }, | ||
136 | }; | ||
137 | |||
138 | static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) | ||
139 | { | ||
140 | int pmc, byte, unit, sh; | ||
141 | int bit, fmask; | ||
142 | u64 mask = 0, value = 0; | ||
143 | |||
144 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
145 | if (pmc) { | ||
146 | if (pmc > 4) | ||
147 | return -1; | ||
148 | sh = (pmc - 1) * 2; | ||
149 | mask |= 2 << sh; | ||
150 | value |= 1 << sh; | ||
151 | } | ||
152 | if (event & PM_BUSEVENT_MSK) { | ||
153 | unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
154 | if (unit > PM_LASTUNIT) | ||
155 | return -1; | ||
156 | if (unit == PM_ISU0_ALT) | ||
157 | unit = PM_ISU0; | ||
158 | mask |= unit_cons[unit][0]; | ||
159 | value |= unit_cons[unit][1]; | ||
160 | byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
161 | if (byte >= 4) { | ||
162 | if (unit != PM_LSU1) | ||
163 | return -1; | ||
164 | /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ | ||
165 | ++unit; | ||
166 | byte &= 3; | ||
167 | } | ||
168 | if (unit == PM_GRS) { | ||
169 | bit = event & 7; | ||
170 | fmask = (bit == 6)? 7: 3; | ||
171 | sh = grsel_shift[bit]; | ||
172 | mask |= (u64)fmask << sh; | ||
173 | value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; | ||
174 | } | ||
175 | /* Set byte lane select field */ | ||
176 | mask |= 0xfULL << (20 - 4 * byte); | ||
177 | value |= (u64)unit << (20 - 4 * byte); | ||
178 | } | ||
179 | mask |= 0x8000000000000ull; | ||
180 | value |= 0x1000000000000ull; | ||
181 | *maskp = mask; | ||
182 | *valp = value; | ||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | #define MAX_ALT 3 /* at most 3 alternatives for any event */ | ||
187 | |||
188 | static const unsigned int event_alternatives[][MAX_ALT] = { | ||
189 | { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */ | ||
190 | { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ | ||
191 | { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */ | ||
192 | { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */ | ||
193 | { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ | ||
194 | { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ | ||
195 | { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ | ||
196 | { 0x100009, 0x200009 }, /* PM_INST_CMPL */ | ||
197 | { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ | ||
198 | { 0x300009, 0x400009 }, /* PM_INST_DISP */ | ||
199 | }; | ||
200 | |||
201 | /* | ||
202 | * Scan the alternatives table for a match and return the | ||
203 | * index into the alternatives table if found, else -1. | ||
204 | */ | ||
205 | static int find_alternative(unsigned int event) | ||
206 | { | ||
207 | int i, j; | ||
208 | |||
209 | for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { | ||
210 | if (event < event_alternatives[i][0]) | ||
211 | break; | ||
212 | for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) | ||
213 | if (event == event_alternatives[i][j]) | ||
214 | return i; | ||
215 | } | ||
216 | return -1; | ||
217 | } | ||
218 | |||
219 | static const unsigned char bytedecode_alternatives[4][4] = { | ||
220 | /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, | ||
221 | /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, | ||
222 | /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, | ||
223 | /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } | ||
224 | }; | ||
225 | |||
226 | /* | ||
227 | * Some direct events for decodes of event bus byte 3 have alternative | ||
228 | * PMCSEL values on other counters. This returns the alternative | ||
229 | * event code for those that do, or -1 otherwise. This also handles | ||
230 | * alternative PMCSEL values for add events. | ||
231 | */ | ||
232 | static int find_alternative_bdecode(unsigned int event) | ||
233 | { | ||
234 | int pmc, altpmc, pp, j; | ||
235 | |||
236 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
237 | if (pmc == 0 || pmc > 4) | ||
238 | return -1; | ||
239 | altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ | ||
240 | pp = event & PM_PMCSEL_MSK; | ||
241 | for (j = 0; j < 4; ++j) { | ||
242 | if (bytedecode_alternatives[pmc - 1][j] == pp) { | ||
243 | return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | | ||
244 | (altpmc << PM_PMC_SH) | | ||
245 | bytedecode_alternatives[altpmc - 1][j]; | ||
246 | } | ||
247 | } | ||
248 | |||
249 | /* new decode alternatives for power5+ */ | ||
250 | if (pmc == 1 && (pp == 0x0d || pp == 0x0e)) | ||
251 | return event + (2 << PM_PMC_SH) + (0x2e - 0x0d); | ||
252 | if (pmc == 3 && (pp == 0x2e || pp == 0x2f)) | ||
253 | return event - (2 << PM_PMC_SH) - (0x2e - 0x0d); | ||
254 | |||
255 | /* alternative add event encodings */ | ||
256 | if (pp == 0x10 || pp == 0x28) | ||
257 | return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) | | ||
258 | (altpmc << PM_PMC_SH); | ||
259 | |||
260 | return -1; | ||
261 | } | ||
262 | |||
263 | static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) | ||
264 | { | ||
265 | int i, j, ae, nalt = 1; | ||
266 | |||
267 | alt[0] = event; | ||
268 | nalt = 1; | ||
269 | i = find_alternative(event); | ||
270 | if (i >= 0) { | ||
271 | for (j = 0; j < MAX_ALT; ++j) { | ||
272 | ae = event_alternatives[i][j]; | ||
273 | if (ae && ae != event) | ||
274 | alt[nalt++] = ae; | ||
275 | } | ||
276 | } else { | ||
277 | ae = find_alternative_bdecode(event); | ||
278 | if (ae > 0) | ||
279 | alt[nalt++] = ae; | ||
280 | } | ||
281 | return nalt; | ||
282 | } | ||
283 | |||
284 | static int power5p_compute_mmcr(unsigned int event[], int n_ev, | ||
285 | unsigned int hwc[], u64 mmcr[]) | ||
286 | { | ||
287 | u64 mmcr1 = 0; | ||
288 | unsigned int pmc, unit, byte, psel; | ||
289 | unsigned int ttm; | ||
290 | int i, isbus, bit, grsel; | ||
291 | unsigned int pmc_inuse = 0; | ||
292 | unsigned char busbyte[4]; | ||
293 | unsigned char unituse[16]; | ||
294 | int ttmuse; | ||
295 | |||
296 | if (n_ev > 4) | ||
297 | return -1; | ||
298 | |||
299 | /* First pass to count resource use */ | ||
300 | memset(busbyte, 0, sizeof(busbyte)); | ||
301 | memset(unituse, 0, sizeof(unituse)); | ||
302 | for (i = 0; i < n_ev; ++i) { | ||
303 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
304 | if (pmc) { | ||
305 | if (pmc > 4) | ||
306 | return -1; | ||
307 | if (pmc_inuse & (1 << (pmc - 1))) | ||
308 | return -1; | ||
309 | pmc_inuse |= 1 << (pmc - 1); | ||
310 | } | ||
311 | if (event[i] & PM_BUSEVENT_MSK) { | ||
312 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
313 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
314 | if (unit > PM_LASTUNIT) | ||
315 | return -1; | ||
316 | if (unit == PM_ISU0_ALT) | ||
317 | unit = PM_ISU0; | ||
318 | if (byte >= 4) { | ||
319 | if (unit != PM_LSU1) | ||
320 | return -1; | ||
321 | ++unit; | ||
322 | byte &= 3; | ||
323 | } | ||
324 | if (busbyte[byte] && busbyte[byte] != unit) | ||
325 | return -1; | ||
326 | busbyte[byte] = unit; | ||
327 | unituse[unit] = 1; | ||
328 | } | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * Assign resources and set multiplexer selects. | ||
333 | * | ||
334 | * PM_ISU0 can go either on TTM0 or TTM1, but that's the only | ||
335 | * choice we have to deal with. | ||
336 | */ | ||
337 | if (unituse[PM_ISU0] & | ||
338 | (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { | ||
339 | unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ | ||
340 | unituse[PM_ISU0] = 0; | ||
341 | } | ||
342 | /* Set TTM[01]SEL fields. */ | ||
343 | ttmuse = 0; | ||
344 | for (i = PM_FPU; i <= PM_ISU1; ++i) { | ||
345 | if (!unituse[i]) | ||
346 | continue; | ||
347 | if (ttmuse++) | ||
348 | return -1; | ||
349 | mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; | ||
350 | } | ||
351 | ttmuse = 0; | ||
352 | for (; i <= PM_GRS; ++i) { | ||
353 | if (!unituse[i]) | ||
354 | continue; | ||
355 | if (ttmuse++) | ||
356 | return -1; | ||
357 | mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; | ||
358 | } | ||
359 | if (ttmuse > 1) | ||
360 | return -1; | ||
361 | |||
362 | /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */ | ||
363 | for (byte = 0; byte < 4; ++byte) { | ||
364 | unit = busbyte[byte]; | ||
365 | if (!unit) | ||
366 | continue; | ||
367 | if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { | ||
368 | /* get ISU0 through TTM1 rather than TTM0 */ | ||
369 | unit = PM_ISU0_ALT; | ||
370 | } else if (unit == PM_LSU1 + 1) { | ||
371 | /* select lower word of LSU1 for this byte */ | ||
372 | mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); | ||
373 | } | ||
374 | ttm = unit >> 2; | ||
375 | mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); | ||
376 | } | ||
377 | |||
378 | /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ | ||
379 | for (i = 0; i < n_ev; ++i) { | ||
380 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
381 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
382 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
383 | psel = event[i] & PM_PMCSEL_MSK; | ||
384 | isbus = event[i] & PM_BUSEVENT_MSK; | ||
385 | if (!pmc) { | ||
386 | /* Bus event or any-PMC direct event */ | ||
387 | for (pmc = 0; pmc < 4; ++pmc) { | ||
388 | if (!(pmc_inuse & (1 << pmc))) | ||
389 | break; | ||
390 | } | ||
391 | if (pmc >= 4) | ||
392 | return -1; | ||
393 | pmc_inuse |= 1 << pmc; | ||
394 | } else { | ||
395 | /* Direct event */ | ||
396 | --pmc; | ||
397 | if (isbus && (byte & 2) && | ||
398 | (psel == 8 || psel == 0x10 || psel == 0x28)) | ||
399 | /* add events on higher-numbered bus */ | ||
400 | mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); | ||
401 | } | ||
402 | if (isbus && unit == PM_GRS) { | ||
403 | bit = psel & 7; | ||
404 | grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; | ||
405 | mmcr1 |= (u64)grsel << grsel_shift[bit]; | ||
406 | } | ||
407 | if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) | ||
408 | /* select alternate byte lane */ | ||
409 | psel |= 0x10; | ||
410 | if (pmc <= 3) | ||
411 | mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); | ||
412 | hwc[i] = pmc; | ||
413 | } | ||
414 | |||
415 | /* Return MMCRx values */ | ||
416 | mmcr[0] = 0; | ||
417 | if (pmc_inuse & 1) | ||
418 | mmcr[0] = MMCR0_PMC1CE; | ||
419 | if (pmc_inuse & 0x3e) | ||
420 | mmcr[0] |= MMCR0_PMCjCE; | ||
421 | mmcr[1] = mmcr1; | ||
422 | mmcr[2] = 0; | ||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) | ||
427 | { | ||
428 | if (pmc <= 3) | ||
429 | mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); | ||
430 | } | ||
431 | |||
432 | static int power5p_generic_events[] = { | ||
433 | [PERF_COUNT_CPU_CYCLES] = 0xf, | ||
434 | [PERF_COUNT_INSTRUCTIONS] = 0x100009, | ||
435 | [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ | ||
436 | [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ | ||
437 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ | ||
438 | [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ | ||
439 | }; | ||
440 | |||
441 | struct power_pmu power5p_pmu = { | ||
442 | .n_counter = 4, | ||
443 | .max_alternatives = MAX_ALT, | ||
444 | .add_fields = 0x7000000000055ull, | ||
445 | .test_adder = 0x3000040000000ull, | ||
446 | .compute_mmcr = power5p_compute_mmcr, | ||
447 | .get_constraint = power5p_get_constraint, | ||
448 | .get_alternatives = power5p_get_alternatives, | ||
449 | .disable_pmc = power5p_disable_pmc, | ||
450 | .n_generic = ARRAY_SIZE(power5p_generic_events), | ||
451 | .generic_events = power5p_generic_events, | ||
452 | }; | ||
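To connect the table above to the rest of the patch: generic_events is indexed by the PERF_COUNT_* constants, so architecture-independent callers can ask for, say, PERF_COUNT_INSTRUCTIONS and end up with the raw POWER5+ code 0x100009 (a direct event, PMCSEL 0x09 on PMC1). The lookup helper below is a minimal, hypothetical sketch that uses only the struct power_pmu fields visible in the initializer above; it is not part of the patch.

/* Illustrative only: map a generic event index to a raw POWER5+ code. */
static int p5p_generic_to_raw(int idx, unsigned int *eventp)
{
	if (idx < 0 || idx >= power5p_pmu.n_generic)
		return -1;
	*eventp = power5p_pmu.generic_events[idx];
	return 0;
}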
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c new file mode 100644 index 000000000000..379ed1087cca --- /dev/null +++ b/arch/powerpc/kernel/power5-pmu.c | |||
@@ -0,0 +1,475 @@ | |||
1 | /* | ||
2 | * Performance counter support for POWER5 (not POWER5++) processors. | ||
3 | * | ||
4 | * Copyright 2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/perf_counter.h> | ||
13 | #include <asm/reg.h> | ||
14 | |||
15 | /* | ||
16 | * Bits in event code for POWER5 (not POWER5++) | ||
17 | */ | ||
18 | #define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ | ||
19 | #define PM_PMC_MSK 0xf | ||
20 | #define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) | ||
21 | #define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ | ||
22 | #define PM_UNIT_MSK 0xf | ||
23 | #define PM_BYTE_SH 12 /* Byte number of event bus to use */ | ||
24 | #define PM_BYTE_MSK 7 | ||
25 | #define PM_GRS_SH 8 /* Storage subsystem mux select */ | ||
26 | #define PM_GRS_MSK 7 | ||
27 | #define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ | ||
28 | #define PM_PMCSEL_MSK 0x7f | ||
29 | |||
30 | /* Values in PM_UNIT field */ | ||
31 | #define PM_FPU 0 | ||
32 | #define PM_ISU0 1 | ||
33 | #define PM_IFU 2 | ||
34 | #define PM_ISU1 3 | ||
35 | #define PM_IDU 4 | ||
36 | #define PM_ISU0_ALT 6 | ||
37 | #define PM_GRS 7 | ||
38 | #define PM_LSU0 8 | ||
39 | #define PM_LSU1 0xc | ||
40 | #define PM_LASTUNIT 0xc | ||
41 | |||
42 | /* | ||
43 | * Bits in MMCR1 for POWER5 | ||
44 | */ | ||
45 | #define MMCR1_TTM0SEL_SH 62 | ||
46 | #define MMCR1_TTM1SEL_SH 60 | ||
47 | #define MMCR1_TTM2SEL_SH 58 | ||
48 | #define MMCR1_TTM3SEL_SH 56 | ||
49 | #define MMCR1_TTMSEL_MSK 3 | ||
50 | #define MMCR1_TD_CP_DBG0SEL_SH 54 | ||
51 | #define MMCR1_TD_CP_DBG1SEL_SH 52 | ||
52 | #define MMCR1_TD_CP_DBG2SEL_SH 50 | ||
53 | #define MMCR1_TD_CP_DBG3SEL_SH 48 | ||
54 | #define MMCR1_GRS_L2SEL_SH 46 | ||
55 | #define MMCR1_GRS_L2SEL_MSK 3 | ||
56 | #define MMCR1_GRS_L3SEL_SH 44 | ||
57 | #define MMCR1_GRS_L3SEL_MSK 3 | ||
58 | #define MMCR1_GRS_MCSEL_SH 41 | ||
59 | #define MMCR1_GRS_MCSEL_MSK 7 | ||
60 | #define MMCR1_GRS_FABSEL_SH 39 | ||
61 | #define MMCR1_GRS_FABSEL_MSK 3 | ||
62 | #define MMCR1_PMC1_ADDER_SEL_SH 35 | ||
63 | #define MMCR1_PMC2_ADDER_SEL_SH 34 | ||
64 | #define MMCR1_PMC3_ADDER_SEL_SH 33 | ||
65 | #define MMCR1_PMC4_ADDER_SEL_SH 32 | ||
66 | #define MMCR1_PMC1SEL_SH 25 | ||
67 | #define MMCR1_PMC2SEL_SH 17 | ||
68 | #define MMCR1_PMC3SEL_SH 9 | ||
69 | #define MMCR1_PMC4SEL_SH 1 | ||
70 | #define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) | ||
71 | #define MMCR1_PMCSEL_MSK 0x7f | ||
72 | |||
73 | /* | ||
74 | * Bits in MMCRA | ||
75 | */ | ||
76 | |||
77 | /* | ||
78 | * Layout of constraint bits: | ||
79 | * 6666555555555544444444443333333333222222222211111111110000000000 | ||
80 | * 3210987654321098765432109876543210987654321098765432109876543210 | ||
81 | * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><> | ||
82 | * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1 | ||
83 | * | ||
84 | * T0 - TTM0 constraint | ||
85 | * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000 | ||
86 | * | ||
87 | * T1 - TTM1 constraint | ||
88 | * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000 | ||
89 | * | ||
90 | * NC - number of counters | ||
91 | * 51: NC error 0x0008_0000_0000_0000 | ||
92 | * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 | ||
93 | * | ||
94 | * G0..G3 - GRS mux constraints | ||
95 | * 46-47: GRS_L2SEL value | ||
96 | * 44-45: GRS_L3SEL value | ||
97 | * 41-43: GRS_MCSEL value | ||
98 | * 39-40: GRS_FABSEL value | ||
99 | * Note that these match up with their bit positions in MMCR1 | ||
100 | * | ||
101 | * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS | ||
102 | * 37: UC3 error 0x20_0000_0000 | ||
103 | * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000 | ||
104 | * 35: ISU0 events needed 0x08_0000_0000 | ||
105 | * 34: IDU|GRS events needed 0x04_0000_0000 | ||
106 | * | ||
107 | * PS1 | ||
108 | * 33: PS1 error 0x2_0000_0000 | ||
109 | * 31-32: count of events needing PMC1/2 0x1_8000_0000 | ||
110 | * | ||
111 | * PS2 | ||
112 | * 30: PS2 error 0x4000_0000 | ||
113 | * 28-29: count of events needing PMC3/4 0x3000_0000 | ||
114 | * | ||
115 | * B0 | ||
116 | * 24-27: Byte 0 event source 0x0f00_0000 | ||
117 | * Encoding as for the event code | ||
118 | * | ||
119 | * B1, B2, B3 | ||
120 | * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources | ||
121 | * | ||
122 | * P1..P6 | ||
123 | * 0-11: Count of events needing PMC1..PMC6 | ||
124 | */ | ||
125 | |||
126 | static const int grsel_shift[8] = { | ||
127 | MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, | ||
128 | MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, | ||
129 | MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH | ||
130 | }; | ||
131 | |||
132 | /* Masks and values for using events from the various units */ | ||
133 | static u64 unit_cons[PM_LASTUNIT+1][2] = { | ||
134 | [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull }, | ||
135 | [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull }, | ||
136 | [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull }, | ||
137 | [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull }, | ||
138 | [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull }, | ||
139 | [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull }, | ||
140 | }; | ||
141 | |||
142 | static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp) | ||
143 | { | ||
144 | int pmc, byte, unit, sh; | ||
145 | int bit, fmask; | ||
146 | u64 mask = 0, value = 0; | ||
147 | int grp = -1; | ||
148 | |||
149 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
150 | if (pmc) { | ||
151 | if (pmc > 6) | ||
152 | return -1; | ||
153 | sh = (pmc - 1) * 2; | ||
154 | mask |= 2 << sh; | ||
155 | value |= 1 << sh; | ||
156 | if (pmc <= 4) | ||
157 | grp = (pmc - 1) >> 1; | ||
158 | else if (event != 0x500009 && event != 0x600005) | ||
159 | return -1; | ||
160 | } | ||
161 | if (event & PM_BUSEVENT_MSK) { | ||
162 | unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
163 | if (unit > PM_LASTUNIT) | ||
164 | return -1; | ||
165 | if (unit == PM_ISU0_ALT) | ||
166 | unit = PM_ISU0; | ||
167 | mask |= unit_cons[unit][0]; | ||
168 | value |= unit_cons[unit][1]; | ||
169 | byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
170 | if (byte >= 4) { | ||
171 | if (unit != PM_LSU1) | ||
172 | return -1; | ||
173 | /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ | ||
174 | ++unit; | ||
175 | byte &= 3; | ||
176 | } | ||
177 | if (unit == PM_GRS) { | ||
178 | bit = event & 7; | ||
179 | fmask = (bit == 6)? 7: 3; | ||
180 | sh = grsel_shift[bit]; | ||
181 | mask |= (u64)fmask << sh; | ||
182 | value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; | ||
183 | } | ||
184 | /* | ||
185 | * Bus events on bytes 0 and 2 can be counted | ||
186 | * on PMC1/2; bytes 1 and 3 on PMC3/4. | ||
187 | */ | ||
188 | if (!pmc) | ||
189 | grp = byte & 1; | ||
190 | /* Set byte lane select field */ | ||
191 | mask |= 0xfULL << (24 - 4 * byte); | ||
192 | value |= (u64)unit << (24 - 4 * byte); | ||
193 | } | ||
194 | if (grp == 0) { | ||
195 | /* increment PMC1/2 field */ | ||
196 | mask |= 0x200000000ull; | ||
197 | value |= 0x080000000ull; | ||
198 | } else if (grp == 1) { | ||
199 | /* increment PMC3/4 field */ | ||
200 | mask |= 0x40000000ull; | ||
201 | value |= 0x10000000ull; | ||
202 | } | ||
203 | if (pmc < 5) { | ||
204 | /* need a counter from PMC1-4 set */ | ||
205 | mask |= 0x8000000000000ull; | ||
206 | value |= 0x1000000000000ull; | ||
207 | } | ||
208 | *maskp = mask; | ||
209 | *valp = value; | ||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | #define MAX_ALT 3 /* at most 3 alternatives for any event */ | ||
214 | |||
215 | static const unsigned int event_alternatives[][MAX_ALT] = { | ||
216 | { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ | ||
217 | { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ | ||
218 | { 0x100005, 0x600005 }, /* PM_RUN_CYC */ | ||
219 | { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */ | ||
220 | { 0x300009, 0x400009 }, /* PM_INST_DISP */ | ||
221 | }; | ||
222 | |||
223 | /* | ||
224 | * Scan the alternatives table for a match and return the | ||
225 | * index into the alternatives table if found, else -1. | ||
226 | */ | ||
227 | static int find_alternative(unsigned int event) | ||
228 | { | ||
229 | int i, j; | ||
230 | |||
231 | for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { | ||
232 | if (event < event_alternatives[i][0]) | ||
233 | break; | ||
234 | for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) | ||
235 | if (event == event_alternatives[i][j]) | ||
236 | return i; | ||
237 | } | ||
238 | return -1; | ||
239 | } | ||
240 | |||
241 | static const unsigned char bytedecode_alternatives[4][4] = { | ||
242 | /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, | ||
243 | /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, | ||
244 | /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, | ||
245 | /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } | ||
246 | }; | ||
247 | |||
248 | /* | ||
249 | * Some direct events for decodes of event bus byte 3 have alternative | ||
250 | * PMCSEL values on other counters. This returns the alternative | ||
251 | * event code for those that do, or -1 otherwise. | ||
252 | */ | ||
253 | static int find_alternative_bdecode(unsigned int event) | ||
254 | { | ||
255 | int pmc, altpmc, pp, j; | ||
256 | |||
257 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
258 | if (pmc == 0 || pmc > 4) | ||
259 | return -1; | ||
260 | altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ | ||
261 | pp = event & PM_PMCSEL_MSK; | ||
262 | for (j = 0; j < 4; ++j) { | ||
263 | if (bytedecode_alternatives[pmc - 1][j] == pp) { | ||
264 | return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | | ||
265 | (altpmc << PM_PMC_SH) | | ||
266 | bytedecode_alternatives[altpmc - 1][j]; | ||
267 | } | ||
268 | } | ||
269 | return -1; | ||
270 | } | ||
271 | |||
272 | static int power5_get_alternatives(unsigned int event, unsigned int alt[]) | ||
273 | { | ||
274 | int i, j, ae, nalt = 1; | ||
275 | |||
276 | alt[0] = event; | ||
277 | nalt = 1; | ||
278 | i = find_alternative(event); | ||
279 | if (i >= 0) { | ||
280 | for (j = 0; j < MAX_ALT; ++j) { | ||
281 | ae = event_alternatives[i][j]; | ||
282 | if (ae && ae != event) | ||
283 | alt[nalt++] = ae; | ||
284 | } | ||
285 | } else { | ||
286 | ae = find_alternative_bdecode(event); | ||
287 | if (ae > 0) | ||
288 | alt[nalt++] = ae; | ||
289 | } | ||
290 | return nalt; | ||
291 | } | ||
292 | |||
293 | static int power5_compute_mmcr(unsigned int event[], int n_ev, | ||
294 | unsigned int hwc[], u64 mmcr[]) | ||
295 | { | ||
296 | u64 mmcr1 = 0; | ||
297 | unsigned int pmc, unit, byte, psel; | ||
298 | unsigned int ttm, grp; | ||
299 | int i, isbus, bit, grsel; | ||
300 | unsigned int pmc_inuse = 0; | ||
301 | unsigned int pmc_grp_use[2]; | ||
302 | unsigned char busbyte[4]; | ||
303 | unsigned char unituse[16]; | ||
304 | int ttmuse; | ||
305 | |||
306 | if (n_ev > 6) | ||
307 | return -1; | ||
308 | |||
309 | /* First pass to count resource use */ | ||
310 | pmc_grp_use[0] = pmc_grp_use[1] = 0; | ||
311 | memset(busbyte, 0, sizeof(busbyte)); | ||
312 | memset(unituse, 0, sizeof(unituse)); | ||
313 | for (i = 0; i < n_ev; ++i) { | ||
314 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
315 | if (pmc) { | ||
316 | if (pmc > 6) | ||
317 | return -1; | ||
318 | if (pmc_inuse & (1 << (pmc - 1))) | ||
319 | return -1; | ||
320 | pmc_inuse |= 1 << (pmc - 1); | ||
321 | /* count 1/2 vs 3/4 use */ | ||
322 | if (pmc <= 4) | ||
323 | ++pmc_grp_use[(pmc - 1) >> 1]; | ||
324 | } | ||
325 | if (event[i] & PM_BUSEVENT_MSK) { | ||
326 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
327 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
328 | if (unit > PM_LASTUNIT) | ||
329 | return -1; | ||
330 | if (unit == PM_ISU0_ALT) | ||
331 | unit = PM_ISU0; | ||
332 | if (byte >= 4) { | ||
333 | if (unit != PM_LSU1) | ||
334 | return -1; | ||
335 | ++unit; | ||
336 | byte &= 3; | ||
337 | } | ||
338 | if (!pmc) | ||
339 | ++pmc_grp_use[byte & 1]; | ||
340 | if (busbyte[byte] && busbyte[byte] != unit) | ||
341 | return -1; | ||
342 | busbyte[byte] = unit; | ||
343 | unituse[unit] = 1; | ||
344 | } | ||
345 | } | ||
346 | if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2) | ||
347 | return -1; | ||
348 | |||
349 | /* | ||
350 | * Assign resources and set multiplexer selects. | ||
351 | * | ||
352 | * PM_ISU0 can go either on TTM0 or TTM1, but that's the only | ||
353 | * choice we have to deal with. | ||
354 | */ | ||
355 | if (unituse[PM_ISU0] & | ||
356 | (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { | ||
357 | unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ | ||
358 | unituse[PM_ISU0] = 0; | ||
359 | } | ||
360 | /* Set TTM[01]SEL fields. */ | ||
361 | ttmuse = 0; | ||
362 | for (i = PM_FPU; i <= PM_ISU1; ++i) { | ||
363 | if (!unituse[i]) | ||
364 | continue; | ||
365 | if (ttmuse++) | ||
366 | return -1; | ||
367 | mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; | ||
368 | } | ||
369 | ttmuse = 0; | ||
370 | for (; i <= PM_GRS; ++i) { | ||
371 | if (!unituse[i]) | ||
372 | continue; | ||
373 | if (ttmuse++) | ||
374 | return -1; | ||
375 | mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; | ||
376 | } | ||
377 | if (ttmuse > 1) | ||
378 | return -1; | ||
379 | |||
380 | /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */ | ||
381 | for (byte = 0; byte < 4; ++byte) { | ||
382 | unit = busbyte[byte]; | ||
383 | if (!unit) | ||
384 | continue; | ||
385 | if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { | ||
386 | /* get ISU0 through TTM1 rather than TTM0 */ | ||
387 | unit = PM_ISU0_ALT; | ||
388 | } else if (unit == PM_LSU1 + 1) { | ||
389 | /* select lower word of LSU1 for this byte */ | ||
390 | mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); | ||
391 | } | ||
392 | ttm = unit >> 2; | ||
393 | mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); | ||
394 | } | ||
395 | |||
396 | /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ | ||
397 | for (i = 0; i < n_ev; ++i) { | ||
398 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
399 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
400 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
401 | psel = event[i] & PM_PMCSEL_MSK; | ||
402 | isbus = event[i] & PM_BUSEVENT_MSK; | ||
403 | if (!pmc) { | ||
404 | /* Bus event or any-PMC direct event */ | ||
405 | for (pmc = 0; pmc < 4; ++pmc) { | ||
406 | if (pmc_inuse & (1 << pmc)) | ||
407 | continue; | ||
408 | grp = (pmc >> 1) & 1; | ||
409 | if (isbus) { | ||
410 | if (grp == (byte & 1)) | ||
411 | break; | ||
412 | } else if (pmc_grp_use[grp] < 2) { | ||
413 | ++pmc_grp_use[grp]; | ||
414 | break; | ||
415 | } | ||
416 | } | ||
417 | pmc_inuse |= 1 << pmc; | ||
418 | } else if (pmc <= 4) { | ||
419 | /* Direct event */ | ||
420 | --pmc; | ||
421 | if ((psel == 8 || psel == 0x10) && isbus && (byte & 2)) | ||
422 | /* add events on higher-numbered bus */ | ||
423 | mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); | ||
424 | } else { | ||
425 | /* Instructions or run cycles on PMC5/6 */ | ||
426 | --pmc; | ||
427 | } | ||
428 | if (isbus && unit == PM_GRS) { | ||
429 | bit = psel & 7; | ||
430 | grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; | ||
431 | mmcr1 |= (u64)grsel << grsel_shift[bit]; | ||
432 | } | ||
433 | if (pmc <= 3) | ||
434 | mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); | ||
435 | hwc[i] = pmc; | ||
436 | } | ||
437 | |||
438 | /* Return MMCRx values */ | ||
439 | mmcr[0] = 0; | ||
440 | if (pmc_inuse & 1) | ||
441 | mmcr[0] = MMCR0_PMC1CE; | ||
442 | if (pmc_inuse & 0x3e) | ||
443 | mmcr[0] |= MMCR0_PMCjCE; | ||
444 | mmcr[1] = mmcr1; | ||
445 | mmcr[2] = 0; | ||
446 | return 0; | ||
447 | } | ||
448 | |||
449 | static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) | ||
450 | { | ||
451 | if (pmc <= 3) | ||
452 | mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); | ||
453 | } | ||
454 | |||
455 | static int power5_generic_events[] = { | ||
456 | [PERF_COUNT_CPU_CYCLES] = 0xf, | ||
457 | [PERF_COUNT_INSTRUCTIONS] = 0x100009, | ||
458 | [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ | ||
459 | [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ | ||
460 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ | ||
461 | [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ | ||
462 | }; | ||
463 | |||
464 | struct power_pmu power5_pmu = { | ||
465 | .n_counter = 6, | ||
466 | .max_alternatives = MAX_ALT, | ||
467 | .add_fields = 0x7000090000555ull, | ||
468 | .test_adder = 0x3000490000000ull, | ||
469 | .compute_mmcr = power5_compute_mmcr, | ||
470 | .get_constraint = power5_get_constraint, | ||
471 | .get_alternatives = power5_get_alternatives, | ||
472 | .disable_pmc = power5_disable_pmc, | ||
473 | .n_generic = ARRAY_SIZE(power5_generic_events), | ||
474 | .generic_events = power5_generic_events, | ||
475 | }; | ||
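A quick note on how the tables above are used: get_alternatives() hands the scheduling code equivalent encodings of the same event, so an event can be moved to a free counter. For POWER5, PM_INST_CMPL is listed with three encodings (0x100009, 0x200009, 0x500009, i.e. PMC1, PMC2 or PMC5). The fragment below is an illustrative call, not part of the patch:

/* Illustrative only: enumerate the POWER5 encodings of PM_INST_CMPL. */
static int p5_inst_cmpl_alternatives(unsigned int alt[MAX_ALT])
{
	/* returns 3; alt[] = { 0x100009, 0x200009, 0x500009 } */
	return power5_get_alternatives(0x100009, alt);
}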
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c new file mode 100644 index 000000000000..b1f61f3c97bb --- /dev/null +++ b/arch/powerpc/kernel/power6-pmu.c | |||
@@ -0,0 +1,283 @@ | |||
1 | /* | ||
2 | * Performance counter support for POWER6 processors. | ||
3 | * | ||
4 | * Copyright 2008-2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/perf_counter.h> | ||
13 | #include <asm/reg.h> | ||
14 | |||
15 | /* | ||
16 | * Bits in event code for POWER6 | ||
17 | */ | ||
18 | #define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ | ||
19 | #define PM_PMC_MSK 0x7 | ||
20 | #define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) | ||
21 | #define PM_UNIT_SH 16 /* Unit the event comes from (TTMxSEL encoding) */ | ||
22 | #define PM_UNIT_MSK 0xf | ||
23 | #define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH) | ||
24 | #define PM_LLAV 0x8000 /* Load lookahead match value */ | ||
25 | #define PM_LLA 0x4000 /* Load lookahead match enable */ | ||
26 | #define PM_BYTE_SH 12 /* Byte of event bus to use */ | ||
27 | #define PM_BYTE_MSK 3 | ||
28 | #define PM_SUBUNIT_SH 8 /* Subunit the event comes from (NEST_SEL enc.) */ | ||
29 | #define PM_SUBUNIT_MSK 7 | ||
30 | #define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH) | ||
31 | #define PM_PMCSEL_MSK 0xff /* PMCxSEL value */ | ||
32 | #define PM_BUSEVENT_MSK 0xf3700 | ||
33 | |||
34 | /* | ||
35 | * Bits in MMCR1 for POWER6 | ||
36 | */ | ||
37 | #define MMCR1_TTM0SEL_SH 60 | ||
38 | #define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4) | ||
39 | #define MMCR1_TTMSEL_MSK 0xf | ||
40 | #define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK) | ||
41 | #define MMCR1_NESTSEL_SH 45 | ||
42 | #define MMCR1_NESTSEL_MSK 0x7 | ||
43 | #define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK) | ||
44 | #define MMCR1_PMC1_LLA ((u64)1 << 44) | ||
45 | #define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39) | ||
46 | #define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35) | ||
47 | #define MMCR1_PMC1SEL_SH 24 | ||
48 | #define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) | ||
49 | #define MMCR1_PMCSEL_MSK 0xff | ||
50 | |||
51 | /* | ||
52 | * Assign PMC numbers and compute MMCR1 value for a set of events | ||
53 | */ | ||
54 | static int p6_compute_mmcr(unsigned int event[], int n_ev, | ||
55 | unsigned int hwc[], u64 mmcr[]) | ||
56 | { | ||
57 | u64 mmcr1 = 0; | ||
58 | int i; | ||
59 | unsigned int pmc, ev, b, u, s, psel; | ||
60 | unsigned int ttmset = 0; | ||
61 | unsigned int pmc_inuse = 0; | ||
62 | |||
63 | if (n_ev > 4) | ||
64 | return -1; | ||
65 | for (i = 0; i < n_ev; ++i) { | ||
66 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
67 | if (pmc) { | ||
68 | if (pmc_inuse & (1 << (pmc - 1))) | ||
69 | return -1; /* collision! */ | ||
70 | pmc_inuse |= 1 << (pmc - 1); | ||
71 | } | ||
72 | } | ||
73 | for (i = 0; i < n_ev; ++i) { | ||
74 | ev = event[i]; | ||
75 | pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; | ||
76 | if (pmc) { | ||
77 | --pmc; | ||
78 | } else { | ||
79 | /* can go on any PMC; find a free one */ | ||
80 | for (pmc = 0; pmc < 4; ++pmc) | ||
81 | if (!(pmc_inuse & (1 << pmc))) | ||
82 | break; | ||
83 | pmc_inuse |= 1 << pmc; | ||
84 | } | ||
85 | hwc[i] = pmc; | ||
86 | psel = ev & PM_PMCSEL_MSK; | ||
87 | if (ev & PM_BUSEVENT_MSK) { | ||
88 | /* this event uses the event bus */ | ||
89 | b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
90 | u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
91 | /* check for conflict on this byte of event bus */ | ||
92 | if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u) | ||
93 | return -1; | ||
94 | mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b); | ||
95 | ttmset |= 1 << b; | ||
96 | if (u == 5) { | ||
97 | /* Nest events have a further mux */ | ||
98 | s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; | ||
99 | if ((ttmset & 0x10) && | ||
100 | MMCR1_NESTSEL(mmcr1) != s) | ||
101 | return -1; | ||
102 | ttmset |= 0x10; | ||
103 | mmcr1 |= (u64)s << MMCR1_NESTSEL_SH; | ||
104 | } | ||
105 | if (0x30 <= psel && psel <= 0x3d) { | ||
106 | /* these need the PMCx_ADDR_SEL bits */ | ||
107 | if (b >= 2) | ||
108 | mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc; | ||
109 | } | ||
110 | /* bus select values are different for PMC3/4 */ | ||
111 | if (pmc >= 2 && (psel & 0x90) == 0x80) | ||
112 | psel ^= 0x20; | ||
113 | } | ||
114 | if (ev & PM_LLA) { | ||
115 | mmcr1 |= MMCR1_PMC1_LLA >> pmc; | ||
116 | if (ev & PM_LLAV) | ||
117 | mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; | ||
118 | } | ||
119 | mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); | ||
120 | } | ||
121 | mmcr[0] = 0; | ||
122 | if (pmc_inuse & 1) | ||
123 | mmcr[0] = MMCR0_PMC1CE; | ||
124 | if (pmc_inuse & 0xe) | ||
125 | mmcr[0] |= MMCR0_PMCjCE; | ||
126 | mmcr[1] = mmcr1; | ||
127 | mmcr[2] = 0; | ||
128 | return 0; | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Layout of constraint bits: | ||
133 | * | ||
134 | * 0-1 add field: number of uses of PMC1 (max 1) | ||
135 | * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4 | ||
136 | * 8-10 select field: nest (subunit) event selector | ||
137 | * 16-19 select field: unit on byte 0 of event bus | ||
138 | * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 | ||
139 | */ | ||
140 | static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) | ||
141 | { | ||
142 | int pmc, byte, sh; | ||
143 | unsigned int mask = 0, value = 0; | ||
144 | |||
145 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
146 | if (pmc) { | ||
147 | if (pmc > 4) | ||
148 | return -1; | ||
149 | sh = (pmc - 1) * 2; | ||
150 | mask |= 2 << sh; | ||
151 | value |= 1 << sh; | ||
152 | } | ||
153 | if (event & PM_BUSEVENT_MSK) { | ||
154 | byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
155 | sh = byte * 4; | ||
156 | mask |= PM_UNIT_MSKS << sh; | ||
157 | value |= (event & PM_UNIT_MSKS) << sh; | ||
158 | if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { | ||
159 | mask |= PM_SUBUNIT_MSKS; | ||
160 | value |= event & PM_SUBUNIT_MSKS; | ||
161 | } | ||
162 | } | ||
163 | *maskp = mask; | ||
164 | *valp = value; | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | #define MAX_ALT 4 /* at most 4 alternatives for any event */ | ||
169 | |||
170 | static const unsigned int event_alternatives[][MAX_ALT] = { | ||
171 | { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ | ||
172 | { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ | ||
173 | { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ | ||
174 | { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */ | ||
175 | { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ | ||
176 | { 0x10000e, 0x400010 }, /* PM_PURR */ | ||
177 | { 0x100010, 0x4000f8 }, /* PM_FLUSH */ | ||
178 | { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */ | ||
179 | { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */ | ||
180 | { 0x100054, 0x2000f0 }, /* PM_ST_FIN */ | ||
181 | { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */ | ||
182 | { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */ | ||
183 | { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */ | ||
184 | { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */ | ||
185 | { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */ | ||
186 | { 0x200012, 0x300012 }, /* PM_INST_DISP */ | ||
187 | { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */ | ||
188 | { 0x2000f8, 0x300010 }, /* PM_EXT_INT */ | ||
189 | { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */ | ||
190 | { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */ | ||
191 | { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */ | ||
192 | { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */ | ||
193 | { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */ | ||
194 | }; | ||
195 | |||
196 | /* | ||
197 | * This could be made more efficient with a binary search on | ||
198 | * a presorted list, if necessary | ||
199 | */ | ||
200 | static int find_alternatives_list(unsigned int event) | ||
201 | { | ||
202 | int i, j; | ||
203 | unsigned int alt; | ||
204 | |||
205 | for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { | ||
206 | if (event < event_alternatives[i][0]) | ||
207 | return -1; | ||
208 | for (j = 0; j < MAX_ALT; ++j) { | ||
209 | alt = event_alternatives[i][j]; | ||
210 | if (!alt || event < alt) | ||
211 | break; | ||
212 | if (event == alt) | ||
213 | return i; | ||
214 | } | ||
215 | } | ||
216 | return -1; | ||
217 | } | ||
218 | |||
219 | static int p6_get_alternatives(unsigned int event, unsigned int alt[]) | ||
220 | { | ||
221 | int i, j; | ||
222 | unsigned int aevent, psel, pmc; | ||
223 | unsigned int nalt = 1; | ||
224 | |||
225 | alt[0] = event; | ||
226 | |||
227 | /* check the alternatives table */ | ||
228 | i = find_alternatives_list(event); | ||
229 | if (i >= 0) { | ||
230 | /* copy out alternatives from list */ | ||
231 | for (j = 0; j < MAX_ALT; ++j) { | ||
232 | aevent = event_alternatives[i][j]; | ||
233 | if (!aevent) | ||
234 | break; | ||
235 | if (aevent != event) | ||
236 | alt[nalt++] = aevent; | ||
237 | } | ||
238 | |||
239 | } else { | ||
240 | /* Check for alternative ways of computing sum events */ | ||
241 | /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */ | ||
242 | psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */ | ||
243 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
244 | if (pmc && (psel == 0x32 || psel == 0x34)) | ||
245 | alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) | | ||
246 | ((5 - pmc) << PM_PMC_SH); | ||
247 | |||
248 | /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */ | ||
249 | if (pmc && (psel == 0x38 || psel == 0x3a)) | ||
250 | alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) | | ||
251 | ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH); | ||
252 | } | ||
253 | |||
254 | return nalt; | ||
255 | } | ||
256 | |||
257 | static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) | ||
258 | { | ||
259 | /* Set PMCxSEL to 0 to disable PMCx */ | ||
260 | mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); | ||
261 | } | ||
262 | |||
263 | static int power6_generic_events[] = { | ||
264 | [PERF_COUNT_CPU_CYCLES] = 0x1e, | ||
265 | [PERF_COUNT_INSTRUCTIONS] = 2, | ||
266 | [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ | ||
267 | [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ | ||
268 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ | ||
269 | [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ | ||
270 | }; | ||
271 | |||
272 | struct power_pmu power6_pmu = { | ||
273 | .n_counter = 4, | ||
274 | .max_alternatives = MAX_ALT, | ||
275 | .add_fields = 0x55, | ||
276 | .test_adder = 0, | ||
277 | .compute_mmcr = p6_compute_mmcr, | ||
278 | .get_constraint = p6_get_constraint, | ||
279 | .get_alternatives = p6_get_alternatives, | ||
280 | .disable_pmc = p6_disable_pmc, | ||
281 | .n_generic = ARRAY_SIZE(power6_generic_events), | ||
282 | .generic_events = power6_generic_events, | ||
283 | }; | ||
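As a concrete illustration of the POWER6 encoding (added here, not taken from the patch): the generic cache-miss entry 0x30000c decodes, with the field macros at the top of the file, to PMC3 with PMCSEL 0x0c, and (0x30000c & PM_BUSEVENT_MSK) == 0, so no event-bus (TTM/NEST) setup is needed; p6_compute_mmcr() simply writes the PMCSEL value into MMCR1 for PMC3.

/* Illustrative only: decode the POWER6 PM_LD_MISS_L1 code 0x30000c. */
static void p6_decode_example(void)
{
	unsigned int ev = 0x30000c;
	unsigned int pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;	/* 3 */
	unsigned int psel = ev & PM_PMCSEL_MSK;			/* 0x0c */

	(void)pmc;
	(void)psel;
}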
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c new file mode 100644 index 000000000000..c3256580be1a --- /dev/null +++ b/arch/powerpc/kernel/ppc970-pmu.c | |||
@@ -0,0 +1,375 @@ | |||
1 | /* | ||
2 | * Performance counter support for PPC970-family processors. | ||
3 | * | ||
4 | * Copyright 2008-2009 Paul Mackerras, IBM Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/perf_counter.h> | ||
13 | #include <asm/reg.h> | ||
14 | |||
15 | /* | ||
16 | * Bits in event code for PPC970 | ||
17 | */ | ||
18 | #define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ | ||
19 | #define PM_PMC_MSK 0xf | ||
20 | #define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ | ||
21 | #define PM_UNIT_MSK 0xf | ||
22 | #define PM_BYTE_SH 4 /* Byte number of event bus to use */ | ||
23 | #define PM_BYTE_MSK 3 | ||
24 | #define PM_PMCSEL_MSK 0xf | ||
25 | |||
26 | /* Values in PM_UNIT field */ | ||
27 | #define PM_NONE 0 | ||
28 | #define PM_FPU 1 | ||
29 | #define PM_VPU 2 | ||
30 | #define PM_ISU 3 | ||
31 | #define PM_IFU 4 | ||
32 | #define PM_IDU 5 | ||
33 | #define PM_STS 6 | ||
34 | #define PM_LSU0 7 | ||
35 | #define PM_LSU1U 8 | ||
36 | #define PM_LSU1L 9 | ||
37 | #define PM_LASTUNIT 9 | ||
38 | |||
39 | /* | ||
40 | * Bits in MMCR0 for PPC970 | ||
41 | */ | ||
42 | #define MMCR0_PMC1SEL_SH 8 | ||
43 | #define MMCR0_PMC2SEL_SH 1 | ||
44 | #define MMCR_PMCSEL_MSK 0x1f | ||
45 | |||
46 | /* | ||
47 | * Bits in MMCR1 for PPC970 | ||
48 | */ | ||
49 | #define MMCR1_TTM0SEL_SH 62 | ||
50 | #define MMCR1_TTM1SEL_SH 59 | ||
51 | #define MMCR1_TTM3SEL_SH 53 | ||
52 | #define MMCR1_TTMSEL_MSK 3 | ||
53 | #define MMCR1_TD_CP_DBG0SEL_SH 50 | ||
54 | #define MMCR1_TD_CP_DBG1SEL_SH 48 | ||
55 | #define MMCR1_TD_CP_DBG2SEL_SH 46 | ||
56 | #define MMCR1_TD_CP_DBG3SEL_SH 44 | ||
57 | #define MMCR1_PMC1_ADDER_SEL_SH 39 | ||
58 | #define MMCR1_PMC2_ADDER_SEL_SH 38 | ||
59 | #define MMCR1_PMC6_ADDER_SEL_SH 37 | ||
60 | #define MMCR1_PMC5_ADDER_SEL_SH 36 | ||
61 | #define MMCR1_PMC8_ADDER_SEL_SH 35 | ||
62 | #define MMCR1_PMC7_ADDER_SEL_SH 34 | ||
63 | #define MMCR1_PMC3_ADDER_SEL_SH 33 | ||
64 | #define MMCR1_PMC4_ADDER_SEL_SH 32 | ||
65 | #define MMCR1_PMC3SEL_SH 27 | ||
66 | #define MMCR1_PMC4SEL_SH 22 | ||
67 | #define MMCR1_PMC5SEL_SH 17 | ||
68 | #define MMCR1_PMC6SEL_SH 12 | ||
69 | #define MMCR1_PMC7SEL_SH 7 | ||
70 | #define MMCR1_PMC8SEL_SH 2 | ||
71 | |||
72 | static short mmcr1_adder_bits[8] = { | ||
73 | MMCR1_PMC1_ADDER_SEL_SH, | ||
74 | MMCR1_PMC2_ADDER_SEL_SH, | ||
75 | MMCR1_PMC3_ADDER_SEL_SH, | ||
76 | MMCR1_PMC4_ADDER_SEL_SH, | ||
77 | MMCR1_PMC5_ADDER_SEL_SH, | ||
78 | MMCR1_PMC6_ADDER_SEL_SH, | ||
79 | MMCR1_PMC7_ADDER_SEL_SH, | ||
80 | MMCR1_PMC8_ADDER_SEL_SH | ||
81 | }; | ||
82 | |||
83 | /* | ||
84 | * Bits in MMCRA | ||
85 | */ | ||
86 | |||
87 | /* | ||
88 | * Layout of constraint bits: | ||
89 | * 6666555555555544444444443333333333222222222211111111110000000000 | ||
90 | * 3210987654321098765432109876543210987654321098765432109876543210 | ||
91 | * <><>[ >[ >[ >< >< >< >< ><><><><><><><><> | ||
92 | * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 | ||
93 | * | ||
94 | * T0 - TTM0 constraint | ||
95 | * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 | ||
96 | * | ||
97 | * T1 - TTM1 constraint | ||
98 | * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000 | ||
99 | * | ||
100 | * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS | ||
101 | * 43: UC3 error 0x0800_0000_0000 | ||
102 | * 42: FPU|IFU|VPU events needed 0x0400_0000_0000 | ||
103 | * 41: ISU events needed 0x0200_0000_0000 | ||
104 | * 40: IDU|STS events needed 0x0100_0000_0000 | ||
105 | * | ||
106 | * PS1 | ||
107 | * 39: PS1 error 0x0080_0000_0000 | ||
108 | * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 | ||
109 | * | ||
110 | * PS2 | ||
111 | * 35: PS2 error 0x0008_0000_0000 | ||
112 | * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 | ||
113 | * | ||
114 | * B0 | ||
115 | * 28-31: Byte 0 event source 0xf000_0000 | ||
116 | * Encoding as for the event code | ||
117 | * | ||
118 | * B1, B2, B3 | ||
119 | * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources | ||
120 | * | ||
121 | * P1 | ||
122 | * 15: P1 error 0x8000 | ||
123 | * 14-15: Count of events needing PMC1 | ||
124 | * | ||
125 | * P2..P8 | ||
126 | * 0-13: Count of events needing PMC2..PMC8 | ||
127 | */ | ||
128 | |||
129 | /* Masks and values for using events from the various units */ | ||
130 | static u64 unit_cons[PM_LASTUNIT+1][2] = { | ||
131 | [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, | ||
132 | [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull }, | ||
133 | [PM_ISU] = { 0x080000000000ull, 0x020000000000ull }, | ||
134 | [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull }, | ||
135 | [PM_IDU] = { 0x380000000000ull, 0x010000000000ull }, | ||
136 | [PM_STS] = { 0x380000000000ull, 0x310000000000ull }, | ||
137 | }; | ||
138 | |||
139 | static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) | ||
140 | { | ||
141 | int pmc, byte, unit, sh; | ||
142 | u64 mask = 0, value = 0; | ||
143 | int grp = -1; | ||
144 | |||
145 | pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; | ||
146 | if (pmc) { | ||
147 | if (pmc > 8) | ||
148 | return -1; | ||
149 | sh = (pmc - 1) * 2; | ||
150 | mask |= 2 << sh; | ||
151 | value |= 1 << sh; | ||
152 | grp = ((pmc - 1) >> 1) & 1; | ||
153 | } | ||
154 | unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
155 | if (unit) { | ||
156 | if (unit > PM_LASTUNIT) | ||
157 | return -1; | ||
158 | mask |= unit_cons[unit][0]; | ||
159 | value |= unit_cons[unit][1]; | ||
160 | byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
161 | /* | ||
162 | * Bus events on bytes 0 and 2 can be counted | ||
163 | * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. | ||
164 | */ | ||
165 | if (!pmc) | ||
166 | grp = byte & 1; | ||
167 | /* Set byte lane select field */ | ||
168 | mask |= 0xfULL << (28 - 4 * byte); | ||
169 | value |= (u64)unit << (28 - 4 * byte); | ||
170 | } | ||
171 | if (grp == 0) { | ||
172 | /* increment PMC1/2/5/6 field */ | ||
173 | mask |= 0x8000000000ull; | ||
174 | value |= 0x1000000000ull; | ||
175 | } else if (grp == 1) { | ||
176 | /* increment PMC3/4/7/8 field */ | ||
177 | mask |= 0x800000000ull; | ||
178 | value |= 0x100000000ull; | ||
179 | } | ||
180 | *maskp = mask; | ||
181 | *valp = value; | ||
182 | return 0; | ||
183 | } | ||
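
p970_get_constraint() encodes each event's resource needs as a (mask, value) pair over the constraint layout documented above. As a rough, simplified sketch of how such pairs conflict: two select-type constraints are compatible only if their values agree wherever both masks are set. This is a simplification; the real powerpc scheduler also folds in the additive PMC-count fields via the PMU's add_fields/test_adder values, which this stand-alone illustration ignores.

    #include <stdio.h>
    #include <stdint.h>

    /* Select-type fields only: compatible when values agree where both masks overlap */
    static int constraints_compatible(uint64_t mask_a, uint64_t val_a,
                                      uint64_t mask_b, uint64_t val_b)
    {
            uint64_t overlap = mask_a & mask_b;

            return ((val_a ^ val_b) & overlap) == 0;
    }

    int main(void)
    {
            /* Byte-lane 0 select field (bits 28-31), as set by p970_get_constraint() */
            uint64_t mask = 0xfULL << 28;

            /* Two FPU (unit 1) bus events on byte 0 can share the lane */
            printf("FPU vs FPU on byte 0: %d\n",
                   constraints_compatible(mask, 1ULL << 28, mask, 1ULL << 28));
            /* An FPU event and an ISU (unit 3) event both want byte 0: conflict */
            printf("FPU vs ISU on byte 0: %d\n",
                   constraints_compatible(mask, 1ULL << 28, mask, 3ULL << 28));
            return 0;
    }
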
184 | |||
185 | static int p970_get_alternatives(unsigned int event, unsigned int alt[]) | ||
186 | { | ||
187 | alt[0] = event; | ||
188 | |||
189 | /* 2 alternatives for LSU empty */ | ||
190 | if (event == 0x2002 || event == 0x3002) { | ||
191 | alt[1] = event ^ 0x1000; | ||
192 | return 2; | ||
193 | } | ||
194 | |||
195 | return 1; | ||
196 | } | ||
197 | |||
198 | static int p970_compute_mmcr(unsigned int event[], int n_ev, | ||
199 | unsigned int hwc[], u64 mmcr[]) | ||
200 | { | ||
201 | u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; | ||
202 | unsigned int pmc, unit, byte, psel; | ||
203 | unsigned int ttm, grp; | ||
204 | unsigned int pmc_inuse = 0; | ||
205 | unsigned int pmc_grp_use[2]; | ||
206 | unsigned char busbyte[4]; | ||
207 | unsigned char unituse[16]; | ||
208 | unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 }; | ||
209 | unsigned char ttmuse[2]; | ||
210 | unsigned char pmcsel[8]; | ||
211 | int i; | ||
212 | |||
213 | if (n_ev > 8) | ||
214 | return -1; | ||
215 | |||
216 | /* First pass to count resource use */ | ||
217 | pmc_grp_use[0] = pmc_grp_use[1] = 0; | ||
218 | memset(busbyte, 0, sizeof(busbyte)); | ||
219 | memset(unituse, 0, sizeof(unituse)); | ||
220 | for (i = 0; i < n_ev; ++i) { | ||
221 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
222 | if (pmc) { | ||
223 | if (pmc_inuse & (1 << (pmc - 1))) | ||
224 | return -1; | ||
225 | pmc_inuse |= 1 << (pmc - 1); | ||
226 | /* count 1/2/5/6 vs 3/4/7/8 use */ | ||
227 | ++pmc_grp_use[((pmc - 1) >> 1) & 1]; | ||
228 | } | ||
229 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
230 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
231 | if (unit) { | ||
232 | if (unit > PM_LASTUNIT) | ||
233 | return -1; | ||
234 | if (!pmc) | ||
235 | ++pmc_grp_use[byte & 1]; | ||
236 | if (busbyte[byte] && busbyte[byte] != unit) | ||
237 | return -1; | ||
238 | busbyte[byte] = unit; | ||
239 | unituse[unit] = 1; | ||
240 | } | ||
241 | } | ||
242 | if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) | ||
243 | return -1; | ||
244 | |||
245 | /* | ||
246 | * Assign resources and set multiplexer selects. | ||
247 | * | ||
248 | * PM_ISU can go either on TTM0 or TTM1, but that's the only | ||
249 | * choice we have to deal with. | ||
250 | */ | ||
251 | if (unituse[PM_ISU] & | ||
252 | (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU])) | ||
253 | unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */ | ||
254 | /* Set TTM[01]SEL fields. */ | ||
255 | ttmuse[0] = ttmuse[1] = 0; | ||
256 | for (i = PM_FPU; i <= PM_STS; ++i) { | ||
257 | if (!unituse[i]) | ||
258 | continue; | ||
259 | ttm = unitmap[i]; | ||
260 | ++ttmuse[(ttm >> 2) & 1]; | ||
261 | mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH; | ||
262 | } | ||
263 | /* Check only one unit per TTMx */ | ||
264 | if (ttmuse[0] > 1 || ttmuse[1] > 1) | ||
265 | return -1; | ||
266 | |||
267 | /* Set byte lane select fields and TTM3SEL. */ | ||
268 | for (byte = 0; byte < 4; ++byte) { | ||
269 | unit = busbyte[byte]; | ||
270 | if (!unit) | ||
271 | continue; | ||
272 | if (unit <= PM_STS) | ||
273 | ttm = (unitmap[unit] >> 2) & 1; | ||
274 | else if (unit == PM_LSU0) | ||
275 | ttm = 2; | ||
276 | else { | ||
277 | ttm = 3; | ||
278 | if (unit == PM_LSU1L && byte >= 2) | ||
279 | mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); | ||
280 | } | ||
281 | mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); | ||
282 | } | ||
283 | |||
284 | /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ | ||
285 | memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */ | ||
286 | for (i = 0; i < n_ev; ++i) { | ||
287 | pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; | ||
288 | unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; | ||
289 | byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; | ||
290 | psel = event[i] & PM_PMCSEL_MSK; | ||
291 | if (!pmc) { | ||
292 | /* Bus event or any-PMC direct event */ | ||
293 | if (unit) | ||
294 | psel |= 0x10 | ((byte & 2) << 2); | ||
295 | else | ||
296 | psel |= 8; | ||
297 | for (pmc = 0; pmc < 8; ++pmc) { | ||
298 | if (pmc_inuse & (1 << pmc)) | ||
299 | continue; | ||
300 | grp = (pmc >> 1) & 1; | ||
301 | if (unit) { | ||
302 | if (grp == (byte & 1)) | ||
303 | break; | ||
304 | } else if (pmc_grp_use[grp] < 4) { | ||
305 | ++pmc_grp_use[grp]; | ||
306 | break; | ||
307 | } | ||
308 | } | ||
309 | pmc_inuse |= 1 << pmc; | ||
310 | } else { | ||
311 | /* Direct event */ | ||
312 | --pmc; | ||
313 | if (psel == 0 && (byte & 2)) | ||
314 | /* add events on higher-numbered bus */ | ||
315 | mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; | ||
316 | } | ||
317 | pmcsel[pmc] = psel; | ||
318 | hwc[i] = pmc; | ||
319 | } | ||
320 | for (pmc = 0; pmc < 2; ++pmc) | ||
321 | mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); | ||
322 | for (; pmc < 8; ++pmc) | ||
323 | mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); | ||
324 | if (pmc_inuse & 1) | ||
325 | mmcr0 |= MMCR0_PMC1CE; | ||
326 | if (pmc_inuse & 0xfe) | ||
327 | mmcr0 |= MMCR0_PMCjCE; | ||
328 | |||
329 | mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ | ||
330 | |||
331 | /* Return MMCRx values */ | ||
332 | mmcr[0] = mmcr0; | ||
333 | mmcr[1] = mmcr1; | ||
334 | mmcr[2] = mmcra; | ||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) | ||
339 | { | ||
340 | int shift, i; | ||
341 | |||
342 | if (pmc <= 1) { | ||
343 | shift = MMCR0_PMC1SEL_SH - 7 * pmc; | ||
344 | i = 0; | ||
345 | } else { | ||
346 | shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2); | ||
347 | i = 1; | ||
348 | } | ||
349 | /* | ||
350 | * Setting the PMCxSEL field to 0x08 disables PMC x. | ||
351 | */ | ||
352 | mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift); | ||
353 | } | ||
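
p970_compute_mmcr() and p970_disable_pmc() both rely on the same PMCSEL packing: the PMC1/PMC2 selectors live in MMCR0, PMC3..PMC8 in MMCR1. A tiny stand-alone check of the shift arithmetic, for illustration only:

    #include <stdio.h>

    #define MMCR0_PMC1SEL_SH 8
    #define MMCR1_PMC3SEL_SH 27

    int main(void)
    {
            int pmc;

            /* pmc is 0-based here, as in the compute/disable code above */
            for (pmc = 0; pmc < 8; ++pmc) {
                    if (pmc < 2)
                            printf("PMC%d: MMCR0, shift %d\n", pmc + 1,
                                   MMCR0_PMC1SEL_SH - 7 * pmc);
                    else
                            printf("PMC%d: MMCR1, shift %d\n", pmc + 1,
                                   MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
            }
            return 0;
    }

The printed shifts reproduce the MMCR0_PMC2SEL_SH and MMCR1_PMC4SEL_SH..MMCR1_PMC8SEL_SH constants defined at the top of the file.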
354 | |||
355 | static int ppc970_generic_events[] = { | ||
356 | [PERF_COUNT_CPU_CYCLES] = 7, | ||
357 | [PERF_COUNT_INSTRUCTIONS] = 1, | ||
358 | [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ | ||
359 | [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ | ||
360 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ | ||
361 | [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ | ||
362 | }; | ||
363 | |||
364 | struct power_pmu ppc970_pmu = { | ||
365 | .n_counter = 8, | ||
366 | .max_alternatives = 2, | ||
367 | .add_fields = 0x001100005555ull, | ||
368 | .test_adder = 0x013300000000ull, | ||
369 | .compute_mmcr = p970_compute_mmcr, | ||
370 | .get_constraint = p970_get_constraint, | ||
371 | .get_alternatives = p970_get_alternatives, | ||
372 | .disable_pmc = p970_disable_pmc, | ||
373 | .n_generic = ARRAY_SIZE(ppc970_generic_events), | ||
374 | .generic_events = ppc970_generic_events, | ||
375 | }; | ||

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 76993941cac9..17bbf6f91fbe 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/kprobes.h> | 30 | #include <linux/kprobes.h> |
31 | #include <linux/kdebug.h> | 31 | #include <linux/kdebug.h> |
32 | #include <linux/perf_counter.h> | ||
32 | 33 | ||
33 | #include <asm/firmware.h> | 34 | #include <asm/firmware.h> |
34 | #include <asm/page.h> | 35 | #include <asm/page.h> |
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, | |||
170 | die("Weird page fault", regs, SIGSEGV); | 171 | die("Weird page fault", regs, SIGSEGV); |
171 | } | 172 | } |
172 | 173 | ||
174 | perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); | ||
175 | |||
173 | /* When running in the kernel we expect faults to occur only to | 176 | /* When running in the kernel we expect faults to occur only to |
174 | * addresses in user space. All other faults represent errors in the | 177 | * addresses in user space. All other faults represent errors in the |
175 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 178 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
@@ -309,6 +312,7 @@ good_area: | |||
309 | } | 312 | } |
310 | if (ret & VM_FAULT_MAJOR) { | 313 | if (ret & VM_FAULT_MAJOR) { |
311 | current->maj_flt++; | 314 | current->maj_flt++; |
315 | perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); | ||
312 | #ifdef CONFIG_PPC_SMLPAR | 316 | #ifdef CONFIG_PPC_SMLPAR |
313 | if (firmware_has_feature(FW_FEATURE_CMO)) { | 317 | if (firmware_has_feature(FW_FEATURE_CMO)) { |
314 | preempt_disable(); | 318 | preempt_disable(); |
@@ -316,8 +320,10 @@ good_area: | |||
316 | preempt_enable(); | 320 | preempt_enable(); |
317 | } | 321 | } |
318 | #endif | 322 | #endif |
319 | } else | 323 | } else { |
320 | current->min_flt++; | 324 | current->min_flt++; |
325 | perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); | ||
326 | } | ||
321 | up_read(&mm->mmap_sem); | 327 | up_read(&mm->mmap_sem); |
322 | return 0; | 328 | return 0; |
323 | 329 | ||
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9da795e49337..732ee93a8e98 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype | |||
@@ -1,6 +1,7 @@ | |||
1 | config PPC64 | 1 | config PPC64 |
2 | bool "64-bit kernel" | 2 | bool "64-bit kernel" |
3 | default n | 3 | default n |
4 | select HAVE_PERF_COUNTERS | ||
4 | help | 5 | help |
5 | This option selects whether a 32-bit or a 64-bit kernel | 6 | This option selects whether a 32-bit or a 64-bit kernel |
6 | will be built. | 7 | will be built. |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b3408206091..6da24fc6a09e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -728,6 +728,7 @@ config X86_UP_IOAPIC | |||
728 | config X86_LOCAL_APIC | 728 | config X86_LOCAL_APIC |
729 | def_bool y | 729 | def_bool y |
730 | depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC | 730 | depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC |
731 | select HAVE_PERF_COUNTERS if (!M386 && !M486) | ||
731 | 732 | ||
732 | config X86_IO_APIC | 733 | config X86_IO_APIC |
733 | def_bool y | 734 | def_bool y |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a505202086e8..19c61ef6ab57 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -825,9 +825,10 @@ ia32_sys_call_table: | |||
825 | .quad compat_sys_signalfd4 | 825 | .quad compat_sys_signalfd4 |
826 | .quad sys_eventfd2 | 826 | .quad sys_eventfd2 |
827 | .quad sys_epoll_create1 | 827 | .quad sys_epoll_create1 |
828 | .quad sys_dup3 /* 330 */ | 828 | .quad sys_dup3 /* 330 */ |
829 | .quad sys_pipe2 | 829 | .quad sys_pipe2 |
830 | .quad sys_inotify_init1 | 830 | .quad sys_inotify_init1 |
831 | .quad compat_sys_preadv | 831 | .quad compat_sys_preadv |
832 | .quad compat_sys_pwritev | 832 | .quad compat_sys_pwritev |
833 | .quad sys_perf_counter_open | ||
833 | ia32_syscall_end: | 834 | ia32_syscall_end: |
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 85b46fba4229..aff9f1fcdcd7 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h | |||
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) | |||
247 | #define smp_mb__before_atomic_inc() barrier() | 247 | #define smp_mb__before_atomic_inc() barrier() |
248 | #define smp_mb__after_atomic_inc() barrier() | 248 | #define smp_mb__after_atomic_inc() barrier() |
249 | 249 | ||
250 | /* A 64-bit atomic type */ | ||
251 | |||
252 | typedef struct { | ||
253 | unsigned long long counter; | ||
254 | } atomic64_t; | ||
255 | |||
256 | #define ATOMIC64_INIT(val) { (val) } | ||
257 | |||
258 | /** | ||
259 | * atomic64_read - read atomic64 variable | ||
260 | * @v: pointer of type atomic64_t | ||
261 | * | ||
262 | * Atomically reads the value of @v. | ||
263 | * Doesn't imply a read memory barrier. | ||
264 | */ | ||
265 | #define __atomic64_read(ptr) ((ptr)->counter) | ||
266 | |||
267 | static inline unsigned long long | ||
268 | cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) | ||
269 | { | ||
270 | asm volatile( | ||
271 | |||
272 | LOCK_PREFIX "cmpxchg8b (%[ptr])\n" | ||
273 | |||
274 | : "=A" (old) | ||
275 | |||
276 | : [ptr] "D" (ptr), | ||
277 | "A" (old), | ||
278 | "b" (ll_low(new)), | ||
279 | "c" (ll_high(new)) | ||
280 | |||
281 | : "memory"); | ||
282 | |||
283 | return old; | ||
284 | } | ||
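
cmpxchg8b() above is the only real primitive here; the helpers that follow (xchg, set, read, add_return) are all built as a read/compute/compare-and-swap retry loop on top of it. A portable stand-alone sketch of the same pattern, using the GCC/Clang __atomic builtins instead of the inline assembly (illustration only, not the kernel implementation):

    #include <stdio.h>
    #include <stdint.h>

    /* Retry loop: read the old value, compute the new one, CAS, repeat on failure */
    static uint64_t add_return_u64(uint64_t *ptr, uint64_t delta)
    {
            uint64_t old_val, new_val;

            do {
                    old_val = __atomic_load_n(ptr, __ATOMIC_RELAXED);
                    new_val = old_val + delta;
            } while (!__atomic_compare_exchange_n(ptr, &old_val, new_val,
                                                  0 /* strong */,
                                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));

            return new_val;
    }

    int main(void)
    {
            uint64_t counter = 0;

            printf("%llu\n", (unsigned long long)add_return_u64(&counter, 5));
            printf("%llu\n", (unsigned long long)add_return_u64(&counter, 7));
            return 0;
    }
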
285 | |||
286 | static inline unsigned long long | ||
287 | atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, | ||
288 | unsigned long long new_val) | ||
289 | { | ||
290 | return cmpxchg8b(&ptr->counter, old_val, new_val); | ||
291 | } | ||
292 | |||
293 | /** | ||
294 | * atomic64_xchg - xchg atomic64 variable | ||
295 | * @ptr: pointer to type atomic64_t | ||
296 | * @new_val: value to assign | ||
297 | * @old_val: old value that was there | ||
298 | * | ||
299 | * Atomically xchgs the value of @ptr to @new_val and returns | ||
300 | * the old value. | ||
301 | */ | ||
302 | |||
303 | static inline unsigned long long | ||
304 | atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) | ||
305 | { | ||
306 | unsigned long long old_val; | ||
307 | |||
308 | do { | ||
309 | old_val = __atomic64_read(ptr); | ||
310 | } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); | ||
311 | |||
312 | return old_val; | ||
313 | } | ||
314 | |||
315 | /** | ||
316 | * atomic64_set - set atomic64 variable | ||
317 | * @ptr: pointer to type atomic64_t | ||
318 | * @new_val: value to assign | ||
319 | * | ||
320 | * Atomically sets the value of @ptr to @new_val. | ||
321 | */ | ||
322 | static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) | ||
323 | { | ||
324 | atomic64_xchg(ptr, new_val); | ||
325 | } | ||
326 | |||
327 | /** | ||
328 | * atomic64_read - read atomic64 variable | ||
329 | * @ptr: pointer to type atomic64_t | ||
330 | * | ||
331 | * Atomically reads the value of @ptr and returns it. | ||
332 | */ | ||
333 | static inline unsigned long long atomic64_read(atomic64_t *ptr) | ||
334 | { | ||
335 | unsigned long long curr_val; | ||
336 | |||
337 | do { | ||
338 | curr_val = __atomic64_read(ptr); | ||
339 | } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); | ||
340 | |||
341 | return curr_val; | ||
342 | } | ||
343 | |||
344 | /** | ||
345 | * atomic64_add_return - add and return | ||
346 | * @delta: integer value to add | ||
347 | * @ptr: pointer to type atomic64_t | ||
348 | * | ||
349 | * Atomically adds @delta to @ptr and returns @delta + *@ptr | ||
350 | */ | ||
351 | static inline unsigned long long | ||
352 | atomic64_add_return(unsigned long long delta, atomic64_t *ptr) | ||
353 | { | ||
354 | unsigned long long old_val, new_val; | ||
355 | |||
356 | do { | ||
357 | old_val = __atomic64_read(ptr); | ||
358 | new_val = old_val + delta; | ||
359 | |||
360 | } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); | ||
361 | |||
362 | return new_val; | ||
363 | } | ||
364 | |||
365 | static inline unsigned long long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) | ||
366 | { | ||
367 | return atomic64_add_return(-delta, ptr); | ||
368 | } | ||
369 | |||
370 | static inline unsigned long long atomic64_inc_return(atomic64_t *ptr) | ||
371 | { | ||
372 | return atomic64_add_return(1, ptr); | ||
373 | } | ||
374 | |||
375 | static inline unsigned long long atomic64_dec_return(atomic64_t *ptr) | ||
376 | { | ||
377 | return atomic64_sub_return(1, ptr); | ||
378 | } | ||
379 | |||
380 | /** | ||
381 | * atomic64_add - add integer to atomic64 variable | ||
382 | * @delta: integer value to add | ||
383 | * @ptr: pointer to type atomic64_t | ||
384 | * | ||
385 | * Atomically adds @delta to @ptr. | ||
386 | */ | ||
387 | static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) | ||
388 | { | ||
389 | atomic64_add_return(delta, ptr); | ||
390 | } | ||
391 | |||
392 | /** | ||
393 | * atomic64_sub - subtract the atomic64 variable | ||
394 | * @delta: integer value to subtract | ||
395 | * @ptr: pointer to type atomic64_t | ||
396 | * | ||
397 | * Atomically subtracts @delta from @ptr. | ||
398 | */ | ||
399 | static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) | ||
400 | { | ||
401 | atomic64_add(-delta, ptr); | ||
402 | } | ||
403 | |||
404 | /** | ||
405 | * atomic64_sub_and_test - subtract value from variable and test result | ||
406 | * @delta: integer value to subtract | ||
407 | * @ptr: pointer to type atomic64_t | ||
408 | * | ||
409 | * Atomically subtracts @delta from @ptr and returns | ||
410 | * true if the result is zero, or false for all | ||
411 | * other cases. | ||
412 | */ | ||
413 | static inline int | ||
414 | atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) | ||
415 | { | ||
416 | unsigned long long old_val = atomic64_sub_return(delta, ptr); | ||
417 | |||
418 | return old_val == 0; | ||
419 | } | ||
420 | |||
421 | /** | ||
422 | * atomic64_inc - increment atomic64 variable | ||
423 | * @ptr: pointer to type atomic64_t | ||
424 | * | ||
425 | * Atomically increments @ptr by 1. | ||
426 | */ | ||
427 | static inline void atomic64_inc(atomic64_t *ptr) | ||
428 | { | ||
429 | atomic64_add(1, ptr); | ||
430 | } | ||
431 | |||
432 | /** | ||
433 | * atomic64_dec - decrement atomic64 variable | ||
434 | * @ptr: pointer to type atomic64_t | ||
435 | * | ||
436 | * Atomically decrements @ptr by 1. | ||
437 | */ | ||
438 | static inline void atomic64_dec(atomic64_t *ptr) | ||
439 | { | ||
440 | atomic64_sub(1, ptr); | ||
441 | } | ||
442 | |||
443 | /** | ||
444 | * atomic64_dec_and_test - decrement and test | ||
445 | * @ptr: pointer to type atomic64_t | ||
446 | * | ||
447 | * Atomically decrements @ptr by 1 and | ||
448 | * returns true if the result is 0, or false for all other | ||
449 | * cases. | ||
450 | */ | ||
451 | static inline int atomic64_dec_and_test(atomic64_t *ptr) | ||
452 | { | ||
453 | return atomic64_sub_and_test(1, ptr); | ||
454 | } | ||
455 | |||
456 | /** | ||
457 | * atomic64_inc_and_test - increment and test | ||
458 | * @ptr: pointer to type atomic64_t | ||
459 | * | ||
460 | * Atomically increments @ptr by 1 | ||
461 | * and returns true if the result is zero, or false for all | ||
462 | * other cases. | ||
463 | */ | ||
464 | static inline int atomic64_inc_and_test(atomic64_t *ptr) | ||
465 | { | ||
466 | return atomic64_sub_and_test(-1, ptr); | ||
467 | } | ||
468 | |||
469 | /** | ||
470 | * atomic64_add_negative - add and test if negative | ||
471 | * @delta: integer value to add | ||
472 | * @ptr: pointer to type atomic64_t | ||
473 | * | ||
474 | * Atomically adds @delta to @ptr and returns true | ||
475 | * if the result is negative, or false when | ||
476 | * result is greater than or equal to zero. | ||
477 | */ | ||
478 | static inline int | ||
479 | atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) | ||
480 | { | ||
481 | long long old_val = atomic64_add_return(delta, ptr); | ||
482 | |||
483 | return old_val < 0; | ||
484 | } | ||
485 | |||
250 | #include <asm-generic/atomic.h> | 486 | #include <asm-generic/atomic.h> |
251 | #endif /* _ASM_X86_ATOMIC_32_H */ | 487 | #endif /* _ASM_X86_ATOMIC_32_H */ |
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf258..fe24d2802490 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h | |||
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) | |||
50 | 50 | ||
51 | #ifdef CONFIG_PERF_COUNTERS | 51 | #ifdef CONFIG_PERF_COUNTERS |
52 | BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) | 52 | BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) |
53 | BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) | ||
53 | #endif | 54 | #endif |
54 | 55 | ||
55 | #ifdef CONFIG_X86_MCE_P4THERMAL | 56 | #ifdef CONFIG_X86_MCE_P4THERMAL |
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 039db6aa8e02..f5ebe2aaca4b 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h | |||
@@ -13,6 +13,8 @@ typedef struct { | |||
13 | unsigned int irq_spurious_count; | 13 | unsigned int irq_spurious_count; |
14 | #endif | 14 | #endif |
15 | unsigned int generic_irqs; /* arch dependent */ | 15 | unsigned int generic_irqs; /* arch dependent */ |
16 | unsigned int apic_perf_irqs; | ||
17 | unsigned int apic_pending_irqs; | ||
16 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
17 | unsigned int irq_resched_count; | 19 | unsigned int irq_resched_count; |
18 | unsigned int irq_call_count; | 20 | unsigned int irq_call_count; |
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd70..7309c0ad6902 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h | |||
@@ -29,6 +29,9 @@ | |||
29 | extern void apic_timer_interrupt(void); | 29 | extern void apic_timer_interrupt(void); |
30 | extern void generic_interrupt(void); | 30 | extern void generic_interrupt(void); |
31 | extern void error_interrupt(void); | 31 | extern void error_interrupt(void); |
32 | extern void perf_counter_interrupt(void); | ||
33 | extern void perf_pending_interrupt(void); | ||
34 | |||
32 | extern void spurious_interrupt(void); | 35 | extern void spurious_interrupt(void); |
33 | extern void thermal_interrupt(void); | 36 | extern void thermal_interrupt(void); |
34 | extern void reschedule_interrupt(void); | 37 | extern void reschedule_interrupt(void); |
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h deleted file mode 100644 index fa0fd068bc2e..000000000000 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H | ||
2 | #define _ASM_X86_INTEL_ARCH_PERFMON_H | ||
3 | |||
4 | #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 | ||
5 | #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 | ||
6 | |||
7 | #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 | ||
8 | #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 | ||
9 | |||
10 | #define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) | ||
11 | #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) | ||
12 | #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) | ||
13 | #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) | ||
14 | |||
15 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) | ||
16 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) | ||
17 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) | ||
18 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ | ||
19 | (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) | ||
20 | |||
21 | union cpuid10_eax { | ||
22 | struct { | ||
23 | unsigned int version_id:8; | ||
24 | unsigned int num_counters:8; | ||
25 | unsigned int bit_width:8; | ||
26 | unsigned int mask_length:8; | ||
27 | } split; | ||
28 | unsigned int full; | ||
29 | }; | ||
30 | |||
31 | #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ | ||
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47c..545bb811ccb5 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h | |||
@@ -117,6 +117,11 @@ | |||
117 | #define GENERIC_INTERRUPT_VECTOR 0xed | 117 | #define GENERIC_INTERRUPT_VECTOR 0xed |
118 | 118 | ||
119 | /* | 119 | /* |
120 | * Performance monitoring pending work vector: | ||
121 | */ | ||
122 | #define LOCAL_PENDING_VECTOR 0xec | ||
123 | |||
124 | /* | ||
120 | * First APIC vector available to drivers: (vectors 0x30-0xee) we | 125 | * First APIC vector available to drivers: (vectors 0x30-0xee) we |
121 | * start at 0x31(0x41) to spread out vectors evenly between priority | 126 | * start at 0x31(0x41) to spread out vectors evenly between priority |
122 | * levels. (0x80 is the syscall vector) | 127 | * levels. (0x80 is the syscall vector) |
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h new file mode 100644 index 000000000000..d08dd52cb8ff --- /dev/null +++ b/arch/x86/include/asm/perf_counter.h | |||
@@ -0,0 +1,100 @@ | |||
1 | #ifndef _ASM_X86_PERF_COUNTER_H | ||
2 | #define _ASM_X86_PERF_COUNTER_H | ||
3 | |||
4 | /* | ||
5 | * Performance counter hw details: | ||
6 | */ | ||
7 | |||
8 | #define X86_PMC_MAX_GENERIC 8 | ||
9 | #define X86_PMC_MAX_FIXED 3 | ||
10 | |||
11 | #define X86_PMC_IDX_GENERIC 0 | ||
12 | #define X86_PMC_IDX_FIXED 32 | ||
13 | #define X86_PMC_IDX_MAX 64 | ||
14 | |||
15 | #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 | ||
16 | #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 | ||
17 | |||
18 | #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 | ||
19 | #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 | ||
20 | |||
21 | #define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) | ||
22 | #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) | ||
23 | #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) | ||
24 | #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) | ||
25 | |||
26 | /* | ||
27 | * Includes eventsel and unit mask as well: | ||
28 | */ | ||
29 | #define ARCH_PERFMON_EVENT_MASK 0xffff | ||
30 | |||
31 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c | ||
32 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) | ||
33 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 | ||
34 | #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ | ||
35 | (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) | ||
36 | |||
37 | #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 | ||
38 | |||
39 | /* | ||
40 | * Intel "Architectural Performance Monitoring" CPUID | ||
41 | * detection/enumeration details: | ||
42 | */ | ||
43 | union cpuid10_eax { | ||
44 | struct { | ||
45 | unsigned int version_id:8; | ||
46 | unsigned int num_counters:8; | ||
47 | unsigned int bit_width:8; | ||
48 | unsigned int mask_length:8; | ||
49 | } split; | ||
50 | unsigned int full; | ||
51 | }; | ||
52 | |||
53 | union cpuid10_edx { | ||
54 | struct { | ||
55 | unsigned int num_counters_fixed:4; | ||
56 | unsigned int reserved:28; | ||
57 | } split; | ||
58 | unsigned int full; | ||
59 | }; | ||
60 | |||
61 | |||
62 | /* | ||
63 | * Fixed-purpose performance counters: | ||
64 | */ | ||
65 | |||
66 | /* | ||
67 | * All 3 fixed-mode PMCs are configured via this single MSR: | ||
68 | */ | ||
69 | #define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d | ||
70 | |||
71 | /* | ||
72 | * The counts are available in three separate MSRs: | ||
73 | */ | ||
74 | |||
75 | /* Instr_Retired.Any: */ | ||
76 | #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 | ||
77 | #define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) | ||
78 | |||
79 | /* CPU_CLK_Unhalted.Core: */ | ||
80 | #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a | ||
81 | #define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) | ||
82 | |||
83 | /* CPU_CLK_Unhalted.Ref: */ | ||
84 | #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b | ||
85 | #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) | ||
86 | |||
87 | extern void set_perf_counter_pending(void); | ||
88 | |||
89 | #define clear_perf_counter_pending() do { } while (0) | ||
90 | #define test_perf_counter_pending() (0) | ||
91 | |||
92 | #ifdef CONFIG_PERF_COUNTERS | ||
93 | extern void init_hw_perf_counters(void); | ||
94 | extern void perf_counters_lapic_init(int nmi); | ||
95 | #else | ||
96 | static inline void init_hw_perf_counters(void) { } | ||
97 | static inline void perf_counters_lapic_init(int nmi) { } | ||
98 | #endif | ||
99 | |||
100 | #endif /* _ASM_X86_PERF_COUNTER_H */ | ||
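
The cpuid10_eax/cpuid10_edx unions describe CPUID leaf 0xA, which the initialization code uses to size the generic and fixed counter arrays. A stand-alone user-space sketch of the same decoding, using GCC's <cpuid.h> helper (illustration only; it redefines the unions locally and is x86-specific):

    #include <stdio.h>
    #include <cpuid.h>

    union cpuid10_eax {
            struct {
                    unsigned int version_id:8;
                    unsigned int num_counters:8;
                    unsigned int bit_width:8;
                    unsigned int mask_length:8;
            } split;
            unsigned int full;
    };

    union cpuid10_edx {
            struct {
                    unsigned int num_counters_fixed:4;
                    unsigned int reserved:28;
            } split;
            unsigned int full;
    };

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            union cpuid10_eax ea;
            union cpuid10_edx ed;

            if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
                    fprintf(stderr, "CPUID leaf 0xA not available\n");
                    return 1;
            }

            ea.full = eax;
            ed.full = edx;

            printf("perfmon version:   %u\n", ea.split.version_id);
            printf("generic counters:  %u\n", ea.split.num_counters);
            printf("counter bit width: %u\n", ea.split.bit_width);
            printf("fixed counters:    %u\n", ed.split.num_counters_fixed);
            return 0;
    }
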
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6e72d74cf8dc..0b4d8c2b157d 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h | |||
@@ -340,6 +340,7 @@ | |||
340 | #define __NR_inotify_init1 332 | 340 | #define __NR_inotify_init1 332 |
341 | #define __NR_preadv 333 | 341 | #define __NR_preadv 333 |
342 | #define __NR_pwritev 334 | 342 | #define __NR_pwritev 334 |
343 | #define __NR_perf_counter_open 333 | ||
343 | 344 | ||
344 | #ifdef __KERNEL__ | 345 | #ifdef __KERNEL__ |
345 | 346 | ||
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index f81829462325..d9aad876ad76 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h | |||
@@ -657,7 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1) | |||
657 | __SYSCALL(__NR_preadv, sys_preadv) | 657 | __SYSCALL(__NR_preadv, sys_preadv) |
658 | #define __NR_pwritev 296 | 658 | #define __NR_pwritev 296 |
659 | __SYSCALL(__NR_pwritev, sys_pwritev) | 659 | __SYSCALL(__NR_pwritev, sys_pwritev) |
660 | 660 | #define __NR_perf_counter_open 295 | |
661 | __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) | ||
661 | 662 | ||
662 | #ifndef __NO_STUBS | 663 | #ifndef __NO_STUBS |
663 | #define __ARCH_WANT_OLD_READDIR | 664 | #define __ARCH_WANT_OLD_READDIR |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 098ec84b8c00..fb504f843e58 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | 36 | ||
37 | #include <asm/perf_counter.h> | ||
37 | #include <asm/pgalloc.h> | 38 | #include <asm/pgalloc.h> |
38 | #include <asm/atomic.h> | 39 | #include <asm/atomic.h> |
39 | #include <asm/mpspec.h> | 40 | #include <asm/mpspec.h> |
@@ -755,6 +756,8 @@ static void local_apic_timer_interrupt(void) | |||
755 | inc_irq_stat(apic_timer_irqs); | 756 | inc_irq_stat(apic_timer_irqs); |
756 | 757 | ||
757 | evt->event_handler(evt); | 758 | evt->event_handler(evt); |
759 | |||
760 | perf_counter_unthrottle(); | ||
758 | } | 761 | } |
759 | 762 | ||
760 | /* | 763 | /* |
@@ -1127,6 +1130,7 @@ void __cpuinit setup_local_APIC(void) | |||
1127 | apic_write(APIC_ESR, 0); | 1130 | apic_write(APIC_ESR, 0); |
1128 | } | 1131 | } |
1129 | #endif | 1132 | #endif |
1133 | perf_counters_lapic_init(0); | ||
1130 | 1134 | ||
1131 | preempt_disable(); | 1135 | preempt_disable(); |
1132 | 1136 | ||
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e4..3efcb2b96a15 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | # | 1 | # |
2 | # Makefile for x86-compatible CPU details and quirks | 2 | # Makefile for x86-compatible CPU details, features and quirks |
3 | # | 3 | # |
4 | 4 | ||
5 | # Don't trace early stages of a secondary CPU boot | 5 | # Don't trace early stages of a secondary CPU boot |
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o | |||
23 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o | 23 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
24 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o | 24 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
25 | 25 | ||
26 | obj-$(CONFIG_X86_MCE) += mcheck/ | 26 | obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o |
27 | obj-$(CONFIG_MTRR) += mtrr/ | ||
28 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
29 | 27 | ||
30 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o | 28 | obj-$(CONFIG_X86_MCE) += mcheck/ |
29 | obj-$(CONFIG_MTRR) += mtrr/ | ||
30 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | ||
31 | |||
32 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o | ||
31 | 33 | ||
32 | quiet_cmd_mkcapflags = MKCAP $@ | 34 | quiet_cmd_mkcapflags = MKCAP $@ |
33 | cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ | 35 | cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa64..fd69c514ca2a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -420,6 +420,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
420 | if (c->x86 >= 6) | 420 | if (c->x86 >= 6) |
421 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); | 421 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); |
422 | 422 | ||
423 | /* Enable performance counters for K7 and later */ | ||
424 | if (c->x86 > 6 && c->x86 <= 0x11) | ||
425 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | ||
426 | |||
423 | if (!c->x86_model_id[0]) { | 427 | if (!c->x86_model_id[0]) { |
424 | switch (c->x86) { | 428 | switch (c->x86) { |
425 | case 0xf: | 429 | case 0xf: |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c4f667896c28..a86769efe0df 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/io.h> | 13 | #include <linux/io.h> |
14 | 14 | ||
15 | #include <asm/stackprotector.h> | 15 | #include <asm/stackprotector.h> |
16 | #include <asm/perf_counter.h> | ||
16 | #include <asm/mmu_context.h> | 17 | #include <asm/mmu_context.h> |
17 | #include <asm/hypervisor.h> | 18 | #include <asm/hypervisor.h> |
18 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
@@ -854,6 +855,7 @@ void __init identify_boot_cpu(void) | |||
854 | #else | 855 | #else |
855 | vgetcpu_set_mode(); | 856 | vgetcpu_set_mode(); |
856 | #endif | 857 | #endif |
858 | init_hw_perf_counters(); | ||
857 | } | 859 | } |
858 | 860 | ||
859 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | 861 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 000000000000..1116a41bc7b5 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c | |||
@@ -0,0 +1,1213 @@ | |||
1 | /* | ||
2 | * Performance counter x86 architecture code | ||
3 | * | ||
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright(C) 2009 Jaswinder Singh Rajput | ||
7 | * | ||
8 | * For licencing details see kernel-base/COPYING | ||
9 | */ | ||
10 | |||
11 | #include <linux/perf_counter.h> | ||
12 | #include <linux/capability.h> | ||
13 | #include <linux/notifier.h> | ||
14 | #include <linux/hardirq.h> | ||
15 | #include <linux/kprobes.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/kdebug.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/uaccess.h> | ||
20 | |||
21 | #include <asm/apic.h> | ||
22 | #include <asm/stacktrace.h> | ||
23 | #include <asm/nmi.h> | ||
24 | |||
25 | static bool perf_counters_initialized __read_mostly; | ||
26 | |||
27 | /* | ||
28 | * Number of (generic) HW counters: | ||
29 | */ | ||
30 | static int nr_counters_generic __read_mostly; | ||
31 | static u64 perf_counter_mask __read_mostly; | ||
32 | static u64 counter_value_mask __read_mostly; | ||
33 | static int counter_value_bits __read_mostly; | ||
34 | |||
35 | static int nr_counters_fixed __read_mostly; | ||
36 | |||
37 | struct cpu_hw_counters { | ||
38 | struct perf_counter *counters[X86_PMC_IDX_MAX]; | ||
39 | unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | ||
40 | unsigned long interrupts; | ||
41 | u64 throttle_ctrl; | ||
42 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | ||
43 | int enabled; | ||
44 | }; | ||
45 | |||
46 | /* | ||
47 | * struct pmc_x86_ops - performance counter x86 ops | ||
48 | */ | ||
49 | struct pmc_x86_ops { | ||
50 | u64 (*save_disable_all)(void); | ||
51 | void (*restore_all)(u64); | ||
52 | u64 (*get_status)(u64); | ||
53 | void (*ack_status)(u64); | ||
54 | void (*enable)(int, u64); | ||
55 | void (*disable)(int, u64); | ||
56 | unsigned eventsel; | ||
57 | unsigned perfctr; | ||
58 | u64 (*event_map)(int); | ||
59 | u64 (*raw_event)(u64); | ||
60 | int max_events; | ||
61 | }; | ||
62 | |||
63 | static struct pmc_x86_ops *pmc_ops __read_mostly; | ||
64 | |||
65 | static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { | ||
66 | .enabled = 1, | ||
67 | }; | ||
68 | |||
69 | static __read_mostly int intel_perfmon_version; | ||
70 | |||
71 | /* | ||
72 | * Intel PerfMon v3. Used on Core2 and later. | ||
73 | */ | ||
74 | static const u64 intel_perfmon_event_map[] = | ||
75 | { | ||
76 | [PERF_COUNT_CPU_CYCLES] = 0x003c, | ||
77 | [PERF_COUNT_INSTRUCTIONS] = 0x00c0, | ||
78 | [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, | ||
79 | [PERF_COUNT_CACHE_MISSES] = 0x412e, | ||
80 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, | ||
81 | [PERF_COUNT_BRANCH_MISSES] = 0x00c5, | ||
82 | [PERF_COUNT_BUS_CYCLES] = 0x013c, | ||
83 | }; | ||
84 | |||
85 | static u64 pmc_intel_event_map(int event) | ||
86 | { | ||
87 | return intel_perfmon_event_map[event]; | ||
88 | } | ||
89 | |||
90 | static u64 pmc_intel_raw_event(u64 event) | ||
91 | { | ||
92 | #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL | ||
93 | #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL | ||
94 | #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL | ||
95 | |||
96 | #define CORE_EVNTSEL_MASK \ | ||
97 | (CORE_EVNTSEL_EVENT_MASK | \ | ||
98 | CORE_EVNTSEL_UNIT_MASK | \ | ||
99 | CORE_EVNTSEL_COUNTER_MASK) | ||
100 | |||
101 | return event & CORE_EVNTSEL_MASK; | ||
102 | } | ||
103 | |||
104 | /* | ||
105 | * AMD Performance Monitor K7 and later. | ||
106 | */ | ||
107 | static const u64 amd_perfmon_event_map[] = | ||
108 | { | ||
109 | [PERF_COUNT_CPU_CYCLES] = 0x0076, | ||
110 | [PERF_COUNT_INSTRUCTIONS] = 0x00c0, | ||
111 | [PERF_COUNT_CACHE_REFERENCES] = 0x0080, | ||
112 | [PERF_COUNT_CACHE_MISSES] = 0x0081, | ||
113 | [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, | ||
114 | [PERF_COUNT_BRANCH_MISSES] = 0x00c5, | ||
115 | }; | ||
116 | |||
117 | static u64 pmc_amd_event_map(int event) | ||
118 | { | ||
119 | return amd_perfmon_event_map[event]; | ||
120 | } | ||
121 | |||
122 | static u64 pmc_amd_raw_event(u64 event) | ||
123 | { | ||
124 | #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL | ||
125 | #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL | ||
126 | #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL | ||
127 | |||
128 | #define K7_EVNTSEL_MASK \ | ||
129 | (K7_EVNTSEL_EVENT_MASK | \ | ||
130 | K7_EVNTSEL_UNIT_MASK | \ | ||
131 | K7_EVNTSEL_COUNTER_MASK) | ||
132 | |||
133 | return event & K7_EVNTSEL_MASK; | ||
134 | } | ||
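
Both raw_event helpers deliberately keep only the event-select, unit-mask and counter-mask bits of a user-supplied raw config; the OS/USR/INT control bits are dropped because __hw_perf_counter_init() below sets those itself. A small stand-alone illustration with the Intel mask (the raw value is made up):

    #include <stdio.h>
    #include <stdint.h>

    #define CORE_EVNTSEL_EVENT_MASK    0x000000FFULL
    #define CORE_EVNTSEL_UNIT_MASK     0x0000FF00ULL
    #define CORE_EVNTSEL_COUNTER_MASK  0xFF000000ULL

    #define CORE_EVNTSEL_MASK \
            (CORE_EVNTSEL_EVENT_MASK | \
             CORE_EVNTSEL_UNIT_MASK | \
             CORE_EVNTSEL_COUNTER_MASK)

    int main(void)
    {
            /* Hypothetical raw config: event 0xc5 with control bits 16-22 also set */
            uint64_t raw = 0x5300c5;

            /* Bits 16-23 are dropped; only event, umask and cmask survive */
            printf("raw 0x%llx -> config 0x%llx\n",
                   (unsigned long long)raw,
                   (unsigned long long)(raw & CORE_EVNTSEL_MASK));
            return 0;
    }
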
135 | |||
136 | /* | ||
137 | * Propagate counter elapsed time into the generic counter. | ||
138 | * Can only be executed on the CPU where the counter is active. | ||
139 | * Updates counter->count and hwc->period_left by the elapsed delta. | ||
140 | */ | ||
141 | static void | ||
142 | x86_perf_counter_update(struct perf_counter *counter, | ||
143 | struct hw_perf_counter *hwc, int idx) | ||
144 | { | ||
145 | u64 prev_raw_count, new_raw_count, delta; | ||
146 | |||
147 | /* | ||
148 | * Careful: an NMI might modify the previous counter value. | ||
149 | * | ||
150 | * Our tactic to handle this is to first atomically read and | ||
151 | * exchange a new raw count - then add that new-prev delta | ||
152 | * count to the generic counter atomically: | ||
153 | */ | ||
154 | again: | ||
155 | prev_raw_count = atomic64_read(&hwc->prev_count); | ||
156 | rdmsrl(hwc->counter_base + idx, new_raw_count); | ||
157 | |||
158 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, | ||
159 | new_raw_count) != prev_raw_count) | ||
160 | goto again; | ||
161 | |||
162 | /* | ||
163 | * Now we have the new raw value and have updated the prev | ||
164 | * timestamp already. We can now calculate the elapsed delta | ||
165 | * (counter-)time and add that to the generic counter. | ||
166 | * | ||
167 | * Careful, not all hw sign-extends above the physical width | ||
168 | * of the count, so we do that by clipping the delta to 32 bits: | ||
169 | */ | ||
170 | delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); | ||
171 | |||
172 | atomic64_add(delta, &counter->count); | ||
173 | atomic64_sub(delta, &hwc->period_left); | ||
174 | } | ||
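
The 32-bit clipping in x86_perf_counter_update() is what keeps the delta correct even when the hardware counter wraps or when the MSR read does not sign-extend beyond the physical counter width. A stand-alone illustration of the same arithmetic (the counter values are made up):

    #include <stdio.h>
    #include <stdint.h>

    /* Same delta computation as x86_perf_counter_update() */
    static uint64_t clipped_delta(uint64_t prev_raw, uint64_t new_raw)
    {
            return (uint64_t)(uint32_t)((int32_t)new_raw - (int32_t)prev_raw);
    }

    int main(void)
    {
            /* Normal case: counter moved forward by 0x40 events */
            printf("0x%llx\n", (unsigned long long)clipped_delta(0x100, 0x140));

            /* Wrap-around of the low 32 bits: still a small positive delta (0x20) */
            printf("0x%llx\n",
                   (unsigned long long)clipped_delta(0xfffffff0ULL, 0x10ULL));
            return 0;
    }
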
175 | |||
176 | static atomic_t num_counters; | ||
177 | static DEFINE_MUTEX(pmc_reserve_mutex); | ||
178 | |||
179 | static bool reserve_pmc_hardware(void) | ||
180 | { | ||
181 | int i; | ||
182 | |||
183 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
184 | disable_lapic_nmi_watchdog(); | ||
185 | |||
186 | for (i = 0; i < nr_counters_generic; i++) { | ||
187 | if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) | ||
188 | goto perfctr_fail; | ||
189 | } | ||
190 | |||
191 | for (i = 0; i < nr_counters_generic; i++) { | ||
192 | if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) | ||
193 | goto eventsel_fail; | ||
194 | } | ||
195 | |||
196 | return true; | ||
197 | |||
198 | eventsel_fail: | ||
199 | for (i--; i >= 0; i--) | ||
200 | release_evntsel_nmi(pmc_ops->eventsel + i); | ||
201 | |||
202 | i = nr_counters_generic; | ||
203 | |||
204 | perfctr_fail: | ||
205 | for (i--; i >= 0; i--) | ||
206 | release_perfctr_nmi(pmc_ops->perfctr + i); | ||
207 | |||
208 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
209 | enable_lapic_nmi_watchdog(); | ||
210 | |||
211 | return false; | ||
212 | } | ||
213 | |||
214 | static void release_pmc_hardware(void) | ||
215 | { | ||
216 | int i; | ||
217 | |||
218 | for (i = 0; i < nr_counters_generic; i++) { | ||
219 | release_perfctr_nmi(pmc_ops->perfctr + i); | ||
220 | release_evntsel_nmi(pmc_ops->eventsel + i); | ||
221 | } | ||
222 | |||
223 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
224 | enable_lapic_nmi_watchdog(); | ||
225 | } | ||
226 | |||
227 | static void hw_perf_counter_destroy(struct perf_counter *counter) | ||
228 | { | ||
229 | if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { | ||
230 | release_pmc_hardware(); | ||
231 | mutex_unlock(&pmc_reserve_mutex); | ||
232 | } | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | * Setup the hardware configuration for a given hw_event_type | ||
237 | */ | ||
238 | static int __hw_perf_counter_init(struct perf_counter *counter) | ||
239 | { | ||
240 | struct perf_counter_hw_event *hw_event = &counter->hw_event; | ||
241 | struct hw_perf_counter *hwc = &counter->hw; | ||
242 | int err; | ||
243 | |||
244 | if (unlikely(!perf_counters_initialized)) | ||
245 | return -EINVAL; | ||
246 | |||
247 | err = 0; | ||
248 | if (atomic_inc_not_zero(&num_counters)) { | ||
249 | mutex_lock(&pmc_reserve_mutex); | ||
250 | if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) | ||
251 | err = -EBUSY; | ||
252 | else | ||
253 | atomic_inc(&num_counters); | ||
254 | mutex_unlock(&pmc_reserve_mutex); | ||
255 | } | ||
256 | if (err) | ||
257 | return err; | ||
258 | |||
259 | /* | ||
260 | * Generate PMC IRQs: | ||
261 | * (keep 'enabled' bit clear for now) | ||
262 | */ | ||
263 | hwc->config = ARCH_PERFMON_EVENTSEL_INT; | ||
264 | |||
265 | /* | ||
266 | * Count user and OS events unless requested not to. | ||
267 | */ | ||
268 | if (!hw_event->exclude_user) | ||
269 | hwc->config |= ARCH_PERFMON_EVENTSEL_USR; | ||
270 | if (!hw_event->exclude_kernel) | ||
271 | hwc->config |= ARCH_PERFMON_EVENTSEL_OS; | ||
272 | |||
273 | /* | ||
274 | * If privileged enough, allow NMI events: | ||
275 | */ | ||
276 | hwc->nmi = 0; | ||
277 | if (capable(CAP_SYS_ADMIN) && hw_event->nmi) | ||
278 | hwc->nmi = 1; | ||
279 | |||
280 | hwc->irq_period = hw_event->irq_period; | ||
281 | /* | ||
282 | * Intel PMCs cannot be accessed sanely above 32 bit width, | ||
283 | * so we install an artificial 1<<31 period regardless of | ||
284 | * the generic counter period: | ||
285 | */ | ||
286 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
287 | if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) | ||
288 | hwc->irq_period = 0x7FFFFFFF; | ||
289 | |||
290 | atomic64_set(&hwc->period_left, hwc->irq_period); | ||
291 | |||
292 | /* | ||
293 | * Raw event types provide the config in the event structure | ||
294 | */ | ||
295 | if (perf_event_raw(hw_event)) { | ||
296 | hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); | ||
297 | } else { | ||
298 | if (perf_event_id(hw_event) >= pmc_ops->max_events) | ||
299 | return -EINVAL; | ||
300 | /* | ||
301 | * The generic map: | ||
302 | */ | ||
303 | hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); | ||
304 | } | ||
305 | |||
306 | counter->destroy = hw_perf_counter_destroy; | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | static u64 pmc_intel_save_disable_all(void) | ||
312 | { | ||
313 | u64 ctrl; | ||
314 | |||
315 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); | ||
316 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); | ||
317 | |||
318 | return ctrl; | ||
319 | } | ||
320 | |||
321 | static u64 pmc_amd_save_disable_all(void) | ||
322 | { | ||
323 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
324 | int enabled, idx; | ||
325 | |||
326 | enabled = cpuc->enabled; | ||
327 | cpuc->enabled = 0; | ||
328 | /* | ||
329 | * ensure we write the disable before we start disabling the | ||
330 | * counters proper, so that pmc_amd_enable() does the right thing. | ||
331 | */ | ||
332 | barrier(); | ||
333 | |||
334 | for (idx = 0; idx < nr_counters_generic; idx++) { | ||
335 | u64 val; | ||
336 | |||
337 | rdmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
338 | if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { | ||
339 | val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
340 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
341 | } | ||
342 | } | ||
343 | |||
344 | return enabled; | ||
345 | } | ||
346 | |||
347 | u64 hw_perf_save_disable(void) | ||
348 | { | ||
349 | if (unlikely(!perf_counters_initialized)) | ||
350 | return 0; | ||
351 | |||
352 | return pmc_ops->save_disable_all(); | ||
353 | } | ||
354 | /* | ||
355 | * Exported because of ACPI idle | ||
356 | */ | ||
357 | EXPORT_SYMBOL_GPL(hw_perf_save_disable); | ||
358 | |||
359 | static void pmc_intel_restore_all(u64 ctrl) | ||
360 | { | ||
361 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); | ||
362 | } | ||
363 | |||
364 | static void pmc_amd_restore_all(u64 ctrl) | ||
365 | { | ||
366 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
367 | int idx; | ||
368 | |||
369 | cpuc->enabled = ctrl; | ||
370 | barrier(); | ||
371 | if (!ctrl) | ||
372 | return; | ||
373 | |||
374 | for (idx = 0; idx < nr_counters_generic; idx++) { | ||
375 | if (test_bit(idx, cpuc->active_mask)) { | ||
376 | u64 val; | ||
377 | |||
378 | rdmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
379 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
380 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | ||
381 | } | ||
382 | } | ||
383 | } | ||
384 | |||
385 | void hw_perf_restore(u64 ctrl) | ||
386 | { | ||
387 | if (unlikely(!perf_counters_initialized)) | ||
388 | return; | ||
389 | |||
390 | pmc_ops->restore_all(ctrl); | ||
391 | } | ||
392 | /* | ||
393 | * Exported because of ACPI idle | ||
394 | */ | ||
395 | EXPORT_SYMBOL_GPL(hw_perf_restore); | ||
396 | |||
397 | static u64 pmc_intel_get_status(u64 mask) | ||
398 | { | ||
399 | u64 status; | ||
400 | |||
401 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); | ||
402 | |||
403 | return status; | ||
404 | } | ||
405 | |||
406 | static u64 pmc_amd_get_status(u64 mask) | ||
407 | { | ||
408 | u64 status = 0; | ||
409 | int idx; | ||
410 | |||
411 | for (idx = 0; idx < nr_counters_generic; idx++) { | ||
412 | s64 val; | ||
413 | |||
414 | if (!(mask & (1 << idx))) | ||
415 | continue; | ||
416 | |||
417 | rdmsrl(MSR_K7_PERFCTR0 + idx, val); | ||
418 | val <<= (64 - counter_value_bits); | ||
419 | if (val >= 0) | ||
420 | status |= (1 << idx); | ||
421 | } | ||
422 | |||
423 | return status; | ||
424 | } | ||
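
pmc_amd_get_status() detects overflow by shifting the counter value so that its most significant implemented bit becomes bit 63: counters are armed with a negative start value (see __hw_perf_counter_set_period() below), so that bit stays set until the counter crosses zero, and a cleared top bit therefore means "overflowed". A stand-alone illustration, assuming a 48-bit counter width purely for the example:

    #include <stdio.h>
    #include <stdint.h>

    static int counter_overflowed(uint64_t raw, int counter_value_bits)
    {
            /* Move the counter's top implemented bit up to bit 63 */
            uint64_t val = raw << (64 - counter_value_bits);

            /* Armed counters have the top bit set; once it clears, they overflowed */
            return !(val >> 63);
    }

    int main(void)
    {
            int bits = 48;                          /* assumed width for illustration */
            uint64_t armed   = (1ULL << 48) - 100;  /* programmed to overflow after 100 events */
            uint64_t wrapped = 5;                   /* 5 events past the overflow point */

            printf("armed:   %d\n", counter_overflowed(armed, bits));
            printf("wrapped: %d\n", counter_overflowed(wrapped, bits));
            return 0;
    }
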
425 | |||
426 | static u64 hw_perf_get_status(u64 mask) | ||
427 | { | ||
428 | if (unlikely(!perf_counters_initialized)) | ||
429 | return 0; | ||
430 | |||
431 | return pmc_ops->get_status(mask); | ||
432 | } | ||
433 | |||
434 | static void pmc_intel_ack_status(u64 ack) | ||
435 | { | ||
436 | wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); | ||
437 | } | ||
438 | |||
439 | static void pmc_amd_ack_status(u64 ack) | ||
440 | { | ||
441 | } | ||
442 | |||
443 | static void hw_perf_ack_status(u64 ack) | ||
444 | { | ||
445 | if (unlikely(!perf_counters_initialized)) | ||
446 | return; | ||
447 | |||
448 | pmc_ops->ack_status(ack); | ||
449 | } | ||
450 | |||
451 | static void pmc_intel_enable(int idx, u64 config) | ||
452 | { | ||
453 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, | ||
454 | config | ARCH_PERFMON_EVENTSEL0_ENABLE); | ||
455 | } | ||
456 | |||
457 | static void pmc_amd_enable(int idx, u64 config) | ||
458 | { | ||
459 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
460 | |||
461 | set_bit(idx, cpuc->active_mask); | ||
462 | if (cpuc->enabled) | ||
463 | config |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
464 | |||
465 | wrmsrl(MSR_K7_EVNTSEL0 + idx, config); | ||
466 | } | ||
467 | |||
468 | static void hw_perf_enable(int idx, u64 config) | ||
469 | { | ||
470 | if (unlikely(!perf_counters_initialized)) | ||
471 | return; | ||
472 | |||
473 | pmc_ops->enable(idx, config); | ||
474 | } | ||
475 | |||
476 | static void pmc_intel_disable(int idx, u64 config) | ||
477 | { | ||
478 | wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); | ||
479 | } | ||
480 | |||
481 | static void pmc_amd_disable(int idx, u64 config) | ||
482 | { | ||
483 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
484 | |||
485 | clear_bit(idx, cpuc->active_mask); | ||
486 | wrmsrl(MSR_K7_EVNTSEL0 + idx, config); | ||
487 | |||
488 | } | ||
489 | |||
490 | static void hw_perf_disable(int idx, u64 config) | ||
491 | { | ||
492 | if (unlikely(!perf_counters_initialized)) | ||
493 | return; | ||
494 | |||
495 | pmc_ops->disable(idx, config); | ||
496 | } | ||
497 | |||
498 | static inline void | ||
499 | __pmc_fixed_disable(struct perf_counter *counter, | ||
500 | struct hw_perf_counter *hwc, unsigned int __idx) | ||
501 | { | ||
502 | int idx = __idx - X86_PMC_IDX_FIXED; | ||
503 | u64 ctrl_val, mask; | ||
504 | int err; | ||
505 | |||
506 | mask = 0xfULL << (idx * 4); | ||
507 | |||
508 | rdmsrl(hwc->config_base, ctrl_val); | ||
509 | ctrl_val &= ~mask; | ||
510 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | ||
511 | } | ||
512 | |||
513 | static inline void | ||
514 | __pmc_generic_disable(struct perf_counter *counter, | ||
515 | struct hw_perf_counter *hwc, unsigned int idx) | ||
516 | { | ||
517 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) | ||
518 | __pmc_fixed_disable(counter, hwc, idx); | ||
519 | else | ||
520 | hw_perf_disable(idx, hwc->config); | ||
521 | } | ||
522 | |||
523 | static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); | ||
524 | |||
525 | /* | ||
526 | * Set the next IRQ period, based on the hwc->period_left value. | ||
527 | * To be called with the counter disabled in hw: | ||
528 | */ | ||
529 | static void | ||
530 | __hw_perf_counter_set_period(struct perf_counter *counter, | ||
531 | struct hw_perf_counter *hwc, int idx) | ||
532 | { | ||
533 | s64 left = atomic64_read(&hwc->period_left); | ||
534 | s64 period = hwc->irq_period; | ||
535 | int err; | ||
536 | |||
537 | /* | ||
538 | * If we are way outside a reasonable range then just skip forward: | ||
539 | */ | ||
540 | if (unlikely(left <= -period)) { | ||
541 | left = period; | ||
542 | atomic64_set(&hwc->period_left, left); | ||
543 | } | ||
544 | |||
545 | if (unlikely(left <= 0)) { | ||
546 | left += period; | ||
547 | atomic64_set(&hwc->period_left, left); | ||
548 | } | ||
549 | |||
550 | per_cpu(prev_left[idx], smp_processor_id()) = left; | ||
551 | |||
552 | /* | ||
553 | * The hw counter starts counting from this counter offset, | ||
554 | * mark it to be able to extract future deltas: | ||
555 | */ | ||
556 | atomic64_set(&hwc->prev_count, (u64)-left); | ||
557 | |||
558 | err = checking_wrmsrl(hwc->counter_base + idx, | ||
559 | (u64)(-left) & counter_value_mask); | ||
560 | } | ||
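
__hw_perf_counter_set_period() arms the counter by writing the two's complement of the remaining period, so the hardware counts up and overflows after exactly 'left' events. A stand-alone illustration of that arithmetic (the 48-bit value mask is an assumption for the example, standing in for counter_value_mask):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t counter_value_mask = (1ULL << 48) - 1;  /* assumed 48-bit counter */
            int64_t  left = 100000;                          /* events until next interrupt */

            /* Same expression as the wrmsr in __hw_perf_counter_set_period() */
            uint64_t programmed = (uint64_t)(-left) & counter_value_mask;

            /* The counter counts up and hits the overflow point after 'left' events */
            uint64_t events_to_overflow = (counter_value_mask + 1) - programmed;

            printf("programmed value:   0x%llx\n", (unsigned long long)programmed);
            printf("events to overflow: %llu\n", (unsigned long long)events_to_overflow);
            return 0;
    }
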
561 | |||
562 | static inline void | ||
563 | __pmc_fixed_enable(struct perf_counter *counter, | ||
564 | struct hw_perf_counter *hwc, unsigned int __idx) | ||
565 | { | ||
566 | int idx = __idx - X86_PMC_IDX_FIXED; | ||
567 | u64 ctrl_val, bits, mask; | ||
568 | int err; | ||
569 | |||
570 | /* | ||
571 | * Enable IRQ generation (0x8), | ||
572 | * and enable ring-3 counting (0x2) and ring-0 counting (0x1) | ||
573 | * if requested: | ||
574 | */ | ||
575 | bits = 0x8ULL; | ||
576 | if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) | ||
577 | bits |= 0x2; | ||
578 | if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) | ||
579 | bits |= 0x1; | ||
580 | bits <<= (idx * 4); | ||
581 | mask = 0xfULL << (idx * 4); | ||
582 | |||
583 | rdmsrl(hwc->config_base, ctrl_val); | ||
584 | ctrl_val &= ~mask; | ||
585 | ctrl_val |= bits; | ||
586 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | ||
587 | } | ||
588 | |||
589 | static void | ||
590 | __pmc_generic_enable(struct perf_counter *counter, | ||
591 | struct hw_perf_counter *hwc, int idx) | ||
592 | { | ||
593 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) | ||
594 | __pmc_fixed_enable(counter, hwc, idx); | ||
595 | else | ||
596 | hw_perf_enable(idx, hwc->config); | ||
597 | } | ||
598 | |||
599 | static int | ||
600 | fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) | ||
601 | { | ||
602 | unsigned int event; | ||
603 | |||
604 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
605 | return -1; | ||
606 | |||
607 | if (unlikely(hwc->nmi)) | ||
608 | return -1; | ||
609 | |||
610 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; | ||
611 | |||
612 | if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) | ||
613 | return X86_PMC_IDX_FIXED_INSTRUCTIONS; | ||
614 | if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) | ||
615 | return X86_PMC_IDX_FIXED_CPU_CYCLES; | ||
616 | if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) | ||
617 | return X86_PMC_IDX_FIXED_BUS_CYCLES; | ||
618 | |||
619 | return -1; | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Find a PMC slot for the freshly enabled / scheduled in counter: | ||
624 | */ | ||
625 | static int pmc_generic_enable(struct perf_counter *counter) | ||
626 | { | ||
627 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
628 | struct hw_perf_counter *hwc = &counter->hw; | ||
629 | int idx; | ||
630 | |||
631 | idx = fixed_mode_idx(counter, hwc); | ||
632 | if (idx >= 0) { | ||
633 | /* | ||
634 | * Try to get the fixed counter; if that is already taken | ||
635 | * then try to get a generic counter: | ||
636 | */ | ||
637 | if (test_and_set_bit(idx, cpuc->used)) | ||
638 | goto try_generic; | ||
639 | |||
640 | hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; | ||
641 | /* | ||
642 | * We set it so that counter_base + idx in wrmsr/rdmsr maps to | ||
643 | * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: | ||
644 | */ | ||
645 | hwc->counter_base = | ||
646 | MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; | ||
647 | hwc->idx = idx; | ||
648 | } else { | ||
649 | idx = hwc->idx; | ||
650 | /* Try to get the previous generic counter again */ | ||
651 | if (test_and_set_bit(idx, cpuc->used)) { | ||
652 | try_generic: | ||
653 | idx = find_first_zero_bit(cpuc->used, nr_counters_generic); | ||
654 | if (idx == nr_counters_generic) | ||
655 | return -EAGAIN; | ||
656 | |||
657 | set_bit(idx, cpuc->used); | ||
658 | hwc->idx = idx; | ||
659 | } | ||
660 | hwc->config_base = pmc_ops->eventsel; | ||
661 | hwc->counter_base = pmc_ops->perfctr; | ||
662 | } | ||
663 | |||
664 | perf_counters_lapic_init(hwc->nmi); | ||
665 | |||
666 | __pmc_generic_disable(counter, hwc, idx); | ||
667 | |||
668 | cpuc->counters[idx] = counter; | ||
669 | /* | ||
670 | * Make it visible before enabling the hw: | ||
671 | */ | ||
672 | smp_wmb(); | ||
673 | |||
674 | __hw_perf_counter_set_period(counter, hwc, idx); | ||
675 | __pmc_generic_enable(counter, hwc, idx); | ||
676 | |||
677 | return 0; | ||
678 | } | ||
679 | |||
680 | void perf_counter_print_debug(void) | ||
681 | { | ||
682 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; | ||
683 | struct cpu_hw_counters *cpuc; | ||
684 | int cpu, idx; | ||
685 | |||
686 | if (!nr_counters_generic) | ||
687 | return; | ||
688 | |||
689 | local_irq_disable(); | ||
690 | |||
691 | cpu = smp_processor_id(); | ||
692 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
693 | |||
694 | if (intel_perfmon_version >= 2) { | ||
695 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); | ||
696 | rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); | ||
697 | rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); | ||
698 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); | ||
699 | |||
700 | pr_info("\n"); | ||
701 | pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); | ||
702 | pr_info("CPU#%d: status: %016llx\n", cpu, status); | ||
703 | pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); | ||
704 | pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); | ||
705 | } | ||
706 | pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); | ||
707 | |||
708 | for (idx = 0; idx < nr_counters_generic; idx++) { | ||
709 | rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); | ||
710 | rdmsrl(pmc_ops->perfctr + idx, pmc_count); | ||
711 | |||
712 | prev_left = per_cpu(prev_left[idx], cpu); | ||
713 | |||
714 | pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", | ||
715 | cpu, idx, pmc_ctrl); | ||
716 | pr_info("CPU#%d: gen-PMC%d count: %016llx\n", | ||
717 | cpu, idx, pmc_count); | ||
718 | pr_info("CPU#%d: gen-PMC%d left: %016llx\n", | ||
719 | cpu, idx, prev_left); | ||
720 | } | ||
721 | for (idx = 0; idx < nr_counters_fixed; idx++) { | ||
722 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); | ||
723 | |||
724 | pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", | ||
725 | cpu, idx, pmc_count); | ||
726 | } | ||
727 | local_irq_enable(); | ||
728 | } | ||
729 | |||
730 | static void pmc_generic_disable(struct perf_counter *counter) | ||
731 | { | ||
732 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
733 | struct hw_perf_counter *hwc = &counter->hw; | ||
734 | unsigned int idx = hwc->idx; | ||
735 | |||
736 | __pmc_generic_disable(counter, hwc, idx); | ||
737 | |||
738 | clear_bit(idx, cpuc->used); | ||
739 | cpuc->counters[idx] = NULL; | ||
740 | /* | ||
741 | * Make sure the cleared pointer becomes visible before we | ||
742 | * (potentially) free the counter: | ||
743 | */ | ||
744 | smp_wmb(); | ||
745 | |||
746 | /* | ||
747 | * Drain the remaining delta count out of a counter | ||
748 | * that we are disabling: | ||
749 | */ | ||
750 | x86_perf_counter_update(counter, hwc, idx); | ||
751 | } | ||
752 | |||
753 | /* | ||
754 | * Save and restart an expired counter. Called by NMI contexts, | ||
755 | * so it has to be careful about preempting normal counter ops: | ||
756 | */ | ||
757 | static void perf_save_and_restart(struct perf_counter *counter) | ||
758 | { | ||
759 | struct hw_perf_counter *hwc = &counter->hw; | ||
760 | int idx = hwc->idx; | ||
761 | |||
762 | x86_perf_counter_update(counter, hwc, idx); | ||
763 | __hw_perf_counter_set_period(counter, hwc, idx); | ||
764 | |||
765 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) | ||
766 | __pmc_generic_enable(counter, hwc, idx); | ||
767 | } | ||
768 | |||
769 | /* | ||
770 | * Maximum interrupt frequency of 100KHz per CPU | ||
771 | */ | ||
772 | #define PERFMON_MAX_INTERRUPTS (100000/HZ) | ||
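With HZ == 1000, for example, this evaluates to 100, i.e. at most 100 counter interrupts are serviced per throttle window; since perf_counter_unthrottle() clears cpuc->interrupts again, the net ceiling is the quoted 100 KHz per CPU.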
773 | |||
774 | /* | ||
775 | * This handler is triggered by the local APIC, so the APIC IRQ handling | ||
776 | * rules apply: | ||
777 | */ | ||
778 | static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) | ||
779 | { | ||
780 | int bit, cpu = smp_processor_id(); | ||
781 | u64 ack, status; | ||
782 | struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
783 | int ret = 0; | ||
784 | |||
785 | cpuc->throttle_ctrl = hw_perf_save_disable(); | ||
786 | |||
787 | status = hw_perf_get_status(cpuc->throttle_ctrl); | ||
788 | if (!status) | ||
789 | goto out; | ||
790 | |||
791 | ret = 1; | ||
792 | again: | ||
793 | inc_irq_stat(apic_perf_irqs); | ||
794 | ack = status; | ||
795 | for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { | ||
796 | struct perf_counter *counter = cpuc->counters[bit]; | ||
797 | |||
798 | clear_bit(bit, (unsigned long *) &status); | ||
799 | if (!counter) | ||
800 | continue; | ||
801 | |||
802 | perf_save_and_restart(counter); | ||
803 | if (perf_counter_overflow(counter, nmi, regs)) | ||
804 | __pmc_generic_disable(counter, &counter->hw, bit); | ||
805 | } | ||
806 | |||
807 | hw_perf_ack_status(ack); | ||
808 | |||
809 | /* | ||
810 | * Repeat if there is more work to be done: | ||
811 | */ | ||
812 | status = hw_perf_get_status(cpuc->throttle_ctrl); | ||
813 | if (status) | ||
814 | goto again; | ||
815 | out: | ||
816 | /* | ||
817 | * Restore - do not reenable when global enable is off or throttled: | ||
818 | */ | ||
819 | if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) | ||
820 | hw_perf_restore(cpuc->throttle_ctrl); | ||
821 | |||
822 | return ret; | ||
823 | } | ||
824 | |||
825 | void perf_counter_unthrottle(void) | ||
826 | { | ||
827 | struct cpu_hw_counters *cpuc; | ||
828 | |||
829 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | ||
830 | return; | ||
831 | |||
832 | if (unlikely(!perf_counters_initialized)) | ||
833 | return; | ||
834 | |||
835 | cpuc = &__get_cpu_var(cpu_hw_counters); | ||
836 | if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { | ||
837 | if (printk_ratelimit()) | ||
838 | printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); | ||
839 | hw_perf_restore(cpuc->throttle_ctrl); | ||
840 | } | ||
841 | cpuc->interrupts = 0; | ||
842 | } | ||
843 | |||
844 | void smp_perf_counter_interrupt(struct pt_regs *regs) | ||
845 | { | ||
846 | irq_enter(); | ||
847 | apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); | ||
848 | ack_APIC_irq(); | ||
849 | __smp_perf_counter_interrupt(regs, 0); | ||
850 | irq_exit(); | ||
851 | } | ||
852 | |||
853 | void smp_perf_pending_interrupt(struct pt_regs *regs) | ||
854 | { | ||
855 | irq_enter(); | ||
856 | ack_APIC_irq(); | ||
857 | inc_irq_stat(apic_pending_irqs); | ||
858 | perf_counter_do_pending(); | ||
859 | irq_exit(); | ||
860 | } | ||
861 | |||
862 | void set_perf_counter_pending(void) | ||
863 | { | ||
864 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); | ||
865 | } | ||
866 | |||
867 | void perf_counters_lapic_init(int nmi) | ||
868 | { | ||
869 | u32 apic_val; | ||
870 | |||
871 | if (!perf_counters_initialized) | ||
872 | return; | ||
873 | /* | ||
874 | * Enable the performance counter vector in the APIC LVT: | ||
875 | */ | ||
876 | apic_val = apic_read(APIC_LVTERR); | ||
877 | |||
878 | apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); | ||
879 | if (nmi) | ||
880 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
881 | else | ||
882 | apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); | ||
883 | apic_write(APIC_LVTERR, apic_val); | ||
884 | } | ||
885 | |||
886 | static int __kprobes | ||
887 | perf_counter_nmi_handler(struct notifier_block *self, | ||
888 | unsigned long cmd, void *__args) | ||
889 | { | ||
890 | struct die_args *args = __args; | ||
891 | struct pt_regs *regs; | ||
892 | int ret; | ||
893 | |||
894 | switch (cmd) { | ||
895 | case DIE_NMI: | ||
896 | case DIE_NMI_IPI: | ||
897 | break; | ||
898 | |||
899 | default: | ||
900 | return NOTIFY_DONE; | ||
901 | } | ||
902 | |||
903 | regs = args->regs; | ||
904 | |||
905 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
906 | ret = __smp_perf_counter_interrupt(regs, 1); | ||
907 | |||
908 | return ret ? NOTIFY_STOP : NOTIFY_OK; | ||
909 | } | ||
910 | |||
911 | static __read_mostly struct notifier_block perf_counter_nmi_notifier = { | ||
912 | .notifier_call = perf_counter_nmi_handler, | ||
913 | .next = NULL, | ||
914 | .priority = 1 | ||
915 | }; | ||
916 | |||
917 | static struct pmc_x86_ops pmc_intel_ops = { | ||
918 | .save_disable_all = pmc_intel_save_disable_all, | ||
919 | .restore_all = pmc_intel_restore_all, | ||
920 | .get_status = pmc_intel_get_status, | ||
921 | .ack_status = pmc_intel_ack_status, | ||
922 | .enable = pmc_intel_enable, | ||
923 | .disable = pmc_intel_disable, | ||
924 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, | ||
925 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, | ||
926 | .event_map = pmc_intel_event_map, | ||
927 | .raw_event = pmc_intel_raw_event, | ||
928 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), | ||
929 | }; | ||
930 | |||
931 | static struct pmc_x86_ops pmc_amd_ops = { | ||
932 | .save_disable_all = pmc_amd_save_disable_all, | ||
933 | .restore_all = pmc_amd_restore_all, | ||
934 | .get_status = pmc_amd_get_status, | ||
935 | .ack_status = pmc_amd_ack_status, | ||
936 | .enable = pmc_amd_enable, | ||
937 | .disable = pmc_amd_disable, | ||
938 | .eventsel = MSR_K7_EVNTSEL0, | ||
939 | .perfctr = MSR_K7_PERFCTR0, | ||
940 | .event_map = pmc_amd_event_map, | ||
941 | .raw_event = pmc_amd_raw_event, | ||
942 | .max_events = ARRAY_SIZE(amd_perfmon_event_map), | ||
943 | }; | ||
944 | |||
945 | static struct pmc_x86_ops *pmc_intel_init(void) | ||
946 | { | ||
947 | union cpuid10_edx edx; | ||
948 | union cpuid10_eax eax; | ||
949 | unsigned int unused; | ||
950 | unsigned int ebx; | ||
951 | |||
952 | /* | ||
953 | * Check whether the Architectural PerfMon supports | ||
954 | * the Branch Misses Retired event. | ||
955 | */ | ||
956 | cpuid(10, &eax.full, &ebx, &unused, &edx.full); | ||
957 | if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) | ||
958 | return NULL; | ||
959 | |||
960 | intel_perfmon_version = eax.split.version_id; | ||
961 | if (intel_perfmon_version < 2) | ||
962 | return NULL; | ||
963 | |||
964 | pr_info("Intel Performance Monitoring support detected.\n"); | ||
965 | pr_info("... version: %d\n", intel_perfmon_version); | ||
966 | pr_info("... bit width: %d\n", eax.split.bit_width); | ||
967 | pr_info("... mask length: %d\n", eax.split.mask_length); | ||
968 | |||
969 | nr_counters_generic = eax.split.num_counters; | ||
970 | nr_counters_fixed = edx.split.num_counters_fixed; | ||
971 | counter_value_mask = (1ULL << eax.split.bit_width) - 1; | ||
972 | |||
973 | return &pmc_intel_ops; | ||
974 | } | ||
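The same CPUID leaf 0xa fields can be inspected from user space; a minimal sketch using GCC's <cpuid.h> (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 0xa not supported");
            return 1;
        }
        printf("version:        %u\n", eax & 0xff);
        printf("num counters:   %u\n", (eax >> 8) & 0xff);
        printf("bit width:      %u\n", (eax >> 16) & 0xff);
        printf("mask length:    %u\n", (eax >> 24) & 0xff);
        printf("fixed counters: %u\n", edx & 0x1f);
        return 0;
    }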
975 | |||
976 | static struct pmc_x86_ops *pmc_amd_init(void) | ||
977 | { | ||
978 | nr_counters_generic = 4; | ||
979 | nr_counters_fixed = 0; | ||
980 | counter_value_mask = 0x0000FFFFFFFFFFFFULL; | ||
981 | counter_value_bits = 48; | ||
982 | |||
983 | pr_info("AMD Performance Monitoring support detected.\n"); | ||
984 | |||
985 | return &pmc_amd_ops; | ||
986 | } | ||
987 | |||
988 | void __init init_hw_perf_counters(void) | ||
989 | { | ||
990 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | ||
991 | return; | ||
992 | |||
993 | switch (boot_cpu_data.x86_vendor) { | ||
994 | case X86_VENDOR_INTEL: | ||
995 | pmc_ops = pmc_intel_init(); | ||
996 | break; | ||
997 | case X86_VENDOR_AMD: | ||
998 | pmc_ops = pmc_amd_init(); | ||
999 | break; | ||
1000 | } | ||
1001 | if (!pmc_ops) | ||
1002 | return; | ||
1003 | |||
1004 | pr_info("... num counters: %d\n", nr_counters_generic); | ||
1005 | if (nr_counters_generic > X86_PMC_MAX_GENERIC) { | ||
1006 | nr_counters_generic = X86_PMC_MAX_GENERIC; | ||
1007 | WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", | ||
1008 | nr_counters_generic, X86_PMC_MAX_GENERIC); | ||
1009 | } | ||
1010 | perf_counter_mask = (1 << nr_counters_generic) - 1; | ||
1011 | perf_max_counters = nr_counters_generic; | ||
1012 | |||
1013 | pr_info("... value mask: %016Lx\n", counter_value_mask); | ||
1014 | |||
1015 | if (nr_counters_fixed > X86_PMC_MAX_FIXED) { | ||
1016 | nr_counters_fixed = X86_PMC_MAX_FIXED; | ||
1017 | WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", | ||
1018 | nr_counters_fixed, X86_PMC_MAX_FIXED); | ||
1019 | } | ||
1020 | pr_info("... fixed counters: %d\n", nr_counters_fixed); | ||
1021 | |||
1022 | perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; | ||
1023 | |||
1024 | pr_info("... counter mask: %016Lx\n", perf_counter_mask); | ||
1025 | perf_counters_initialized = true; | ||
1026 | |||
1027 | perf_counters_lapic_init(0); | ||
1028 | register_die_notifier(&perf_counter_nmi_notifier); | ||
1029 | } | ||
1030 | |||
1031 | static void pmc_generic_read(struct perf_counter *counter) | ||
1032 | { | ||
1033 | x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); | ||
1034 | } | ||
1035 | |||
1036 | static const struct hw_perf_counter_ops x86_perf_counter_ops = { | ||
1037 | .enable = pmc_generic_enable, | ||
1038 | .disable = pmc_generic_disable, | ||
1039 | .read = pmc_generic_read, | ||
1040 | }; | ||
1041 | |||
1042 | const struct hw_perf_counter_ops * | ||
1043 | hw_perf_counter_init(struct perf_counter *counter) | ||
1044 | { | ||
1045 | int err; | ||
1046 | |||
1047 | err = __hw_perf_counter_init(counter); | ||
1048 | if (err) | ||
1049 | return ERR_PTR(err); | ||
1050 | |||
1051 | return &x86_perf_counter_ops; | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
1055 | * callchain support | ||
1056 | */ | ||
1057 | |||
1058 | static inline | ||
1059 | void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) | ||
1060 | { | ||
1061 | if (entry->nr < MAX_STACK_DEPTH) | ||
1062 | entry->ip[entry->nr++] = ip; | ||
1063 | } | ||
1064 | |||
1065 | static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); | ||
1066 | static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); | ||
1067 | |||
1068 | |||
1069 | static void | ||
1070 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
1071 | { | ||
1072 | /* Ignore warnings */ | ||
1073 | } | ||
1074 | |||
1075 | static void backtrace_warning(void *data, char *msg) | ||
1076 | { | ||
1077 | /* Ignore warnings */ | ||
1078 | } | ||
1079 | |||
1080 | static int backtrace_stack(void *data, char *name) | ||
1081 | { | ||
1082 | /* Don't bother with IRQ stacks for now */ | ||
1083 | return -1; | ||
1084 | } | ||
1085 | |||
1086 | static void backtrace_address(void *data, unsigned long addr, int reliable) | ||
1087 | { | ||
1088 | struct perf_callchain_entry *entry = data; | ||
1089 | |||
1090 | if (reliable) | ||
1091 | callchain_store(entry, addr); | ||
1092 | } | ||
1093 | |||
1094 | static const struct stacktrace_ops backtrace_ops = { | ||
1095 | .warning = backtrace_warning, | ||
1096 | .warning_symbol = backtrace_warning_symbol, | ||
1097 | .stack = backtrace_stack, | ||
1098 | .address = backtrace_address, | ||
1099 | }; | ||
1100 | |||
1101 | static void | ||
1102 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1103 | { | ||
1104 | unsigned long bp; | ||
1105 | char *stack; | ||
1106 | int nr = entry->nr; | ||
1107 | |||
1108 | callchain_store(entry, instruction_pointer(regs)); | ||
1109 | |||
1110 | stack = ((char *)regs + sizeof(struct pt_regs)); | ||
1111 | #ifdef CONFIG_FRAME_POINTER | ||
1112 | bp = frame_pointer(regs); | ||
1113 | #else | ||
1114 | bp = 0; | ||
1115 | #endif | ||
1116 | |||
1117 | dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); | ||
1118 | |||
1119 | entry->kernel = entry->nr - nr; | ||
1120 | } | ||
1121 | |||
1122 | |||
1123 | struct stack_frame { | ||
1124 | const void __user *next_fp; | ||
1125 | unsigned long return_address; | ||
1126 | }; | ||
1127 | |||
1128 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | ||
1129 | { | ||
1130 | int ret; | ||
1131 | |||
1132 | if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) | ||
1133 | return 0; | ||
1134 | |||
1135 | ret = 1; | ||
1136 | pagefault_disable(); | ||
1137 | if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) | ||
1138 | ret = 0; | ||
1139 | pagefault_enable(); | ||
1140 | |||
1141 | return ret; | ||
1142 | } | ||
1143 | |||
1144 | static void | ||
1145 | perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1146 | { | ||
1147 | struct stack_frame frame; | ||
1148 | const void __user *fp; | ||
1149 | int nr = entry->nr; | ||
1150 | |||
1151 | regs = (struct pt_regs *)current->thread.sp0 - 1; | ||
1152 | fp = (void __user *)regs->bp; | ||
1153 | |||
1154 | callchain_store(entry, regs->ip); | ||
1155 | |||
1156 | while (entry->nr < MAX_STACK_DEPTH) { | ||
1157 | frame.next_fp = NULL; | ||
1158 | frame.return_address = 0; | ||
1159 | |||
1160 | if (!copy_stack_frame(fp, &frame)) | ||
1161 | break; | ||
1162 | |||
1163 | if ((unsigned long)fp < user_stack_pointer(regs)) | ||
1164 | break; | ||
1165 | |||
1166 | callchain_store(entry, frame.return_address); | ||
1167 | fp = frame.next_fp; | ||
1168 | } | ||
1169 | |||
1170 | entry->user = entry->nr - nr; | ||
1171 | } | ||
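The walk above follows nothing but the saved frame-pointer chain; the same idea in a stand-alone program (a sketch that assumes the x86 frame layout and a build with -fno-omit-frame-pointer):

    #include <stdio.h>

    struct stack_frame {
        struct stack_frame *next_fp;
        unsigned long return_address;
    };

    static void __attribute__((noinline)) show_backtrace(void)
    {
        struct stack_frame *fp = __builtin_frame_address(0);
        int depth = 0;

        /* walk the saved-rbp chain until it terminates or we hit a limit */
        while (fp && depth < 16) {
            printf("#%d  %p\n", depth++, (void *)fp->return_address);
            fp = fp->next_fp;
        }
    }

    static void __attribute__((noinline)) b(void) { show_backtrace(); }
    static void __attribute__((noinline)) a(void) { b(); }

    int main(void)
    {
        a();
        return 0;
    }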
1172 | |||
1173 | static void | ||
1174 | perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1175 | { | ||
1176 | int is_user; | ||
1177 | |||
1178 | if (!regs) | ||
1179 | return; | ||
1180 | |||
1181 | is_user = user_mode(regs); | ||
1182 | |||
1183 | if (!current || current->pid == 0) | ||
1184 | return; | ||
1185 | |||
1186 | if (is_user && current->state != TASK_RUNNING) | ||
1187 | return; | ||
1188 | |||
1189 | if (!is_user) | ||
1190 | perf_callchain_kernel(regs, entry); | ||
1191 | |||
1192 | if (current->mm) | ||
1193 | perf_callchain_user(regs, entry); | ||
1194 | } | ||
1195 | |||
1196 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1197 | { | ||
1198 | struct perf_callchain_entry *entry; | ||
1199 | |||
1200 | if (in_nmi()) | ||
1201 | entry = &__get_cpu_var(nmi_entry); | ||
1202 | else | ||
1203 | entry = &__get_cpu_var(irq_entry); | ||
1204 | |||
1205 | entry->nr = 0; | ||
1206 | entry->hv = 0; | ||
1207 | entry->kernel = 0; | ||
1208 | entry->user = 0; | ||
1209 | |||
1210 | perf_do_callchain(regs, entry); | ||
1211 | |||
1212 | return entry; | ||
1213 | } | ||
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f6c70a164e32..d6f5b9fbde32 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -19,8 +19,8 @@ | |||
19 | #include <linux/nmi.h> | 19 | #include <linux/nmi.h> |
20 | #include <linux/kprobes.h> | 20 | #include <linux/kprobes.h> |
21 | 21 | ||
22 | #include <asm/genapic.h> | 22 | #include <asm/apic.h> |
23 | #include <asm/intel_arch_perfmon.h> | 23 | #include <asm/perf_counter.h> |
24 | 24 | ||
25 | struct nmi_watchdog_ctlblk { | 25 | struct nmi_watchdog_ctlblk { |
26 | unsigned int cccr_msr; | 26 | unsigned int cccr_msr; |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a331ec38af9e..1d46cba56fd8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1025,6 +1025,13 @@ apicinterrupt ERROR_APIC_VECTOR \ | |||
1025 | apicinterrupt SPURIOUS_APIC_VECTOR \ | 1025 | apicinterrupt SPURIOUS_APIC_VECTOR \ |
1026 | spurious_interrupt smp_spurious_interrupt | 1026 | spurious_interrupt smp_spurious_interrupt |
1027 | 1027 | ||
1028 | #ifdef CONFIG_PERF_COUNTERS | ||
1029 | apicinterrupt LOCAL_PERF_VECTOR \ | ||
1030 | perf_counter_interrupt smp_perf_counter_interrupt | ||
1031 | apicinterrupt LOCAL_PENDING_VECTOR \ | ||
1032 | perf_pending_interrupt smp_perf_pending_interrupt | ||
1033 | #endif | ||
1034 | |||
1028 | /* | 1035 | /* |
1029 | * Exception entry points. | 1036 | * Exception entry points. |
1030 | */ | 1037 | */ |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8b..d465487da587 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec) | |||
63 | for_each_online_cpu(j) | 63 | for_each_online_cpu(j) |
64 | seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); | 64 | seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); |
65 | seq_printf(p, " Spurious interrupts\n"); | 65 | seq_printf(p, " Spurious interrupts\n"); |
66 | seq_printf(p, "CNT: "); | ||
67 | for_each_online_cpu(j) | ||
68 | seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); | ||
69 | seq_printf(p, " Performance counter interrupts\n"); | ||
70 | seq_printf(p, "PND: "); | ||
71 | for_each_online_cpu(j) | ||
72 | seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); | ||
73 | seq_printf(p, " Performance pending work\n"); | ||
66 | #endif | 74 | #endif |
67 | if (generic_interrupt_extension) { | 75 | if (generic_interrupt_extension) { |
68 | seq_printf(p, "PLT: "); | 76 | seq_printf(p, "PLT: "); |
@@ -166,6 +174,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
166 | #ifdef CONFIG_X86_LOCAL_APIC | 174 | #ifdef CONFIG_X86_LOCAL_APIC |
167 | sum += irq_stats(cpu)->apic_timer_irqs; | 175 | sum += irq_stats(cpu)->apic_timer_irqs; |
168 | sum += irq_stats(cpu)->irq_spurious_count; | 176 | sum += irq_stats(cpu)->irq_spurious_count; |
177 | sum += irq_stats(cpu)->apic_perf_irqs; | ||
178 | sum += irq_stats(cpu)->apic_pending_irqs; | ||
169 | #endif | 179 | #endif |
170 | if (generic_interrupt_extension) | 180 | if (generic_interrupt_extension) |
171 | sum += irq_stats(cpu)->generic_irqs; | 181 | sum += irq_stats(cpu)->generic_irqs; |
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8836f9..3190a6b961e6 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c | |||
@@ -118,28 +118,8 @@ int vector_used_by_percpu_irq(unsigned int vector) | |||
118 | return 0; | 118 | return 0; |
119 | } | 119 | } |
120 | 120 | ||
121 | /* Overridden in paravirt.c */ | 121 | static void __init smp_intr_init(void) |
122 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
123 | |||
124 | void __init native_init_IRQ(void) | ||
125 | { | 122 | { |
126 | int i; | ||
127 | |||
128 | /* Execute any quirks before the call gates are initialised: */ | ||
129 | x86_quirk_pre_intr_init(); | ||
130 | |||
131 | /* | ||
132 | * Cover the whole vector space, no vector can escape | ||
133 | * us. (some of these will be overridden and become | ||
134 | * 'special' SMP interrupts) | ||
135 | */ | ||
136 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | ||
137 | /* SYSCALL_VECTOR was reserved in trap_init. */ | ||
138 | if (i != SYSCALL_VECTOR) | ||
139 | set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); | ||
140 | } | ||
141 | |||
142 | |||
143 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) | 123 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) |
144 | /* | 124 | /* |
145 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | 125 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper |
@@ -168,6 +148,11 @@ void __init native_init_IRQ(void) | |||
168 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | 148 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); |
169 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); | 149 | set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); |
170 | #endif | 150 | #endif |
151 | } | ||
152 | |||
153 | static void __init apic_intr_init(void) | ||
154 | { | ||
155 | smp_intr_init(); | ||
171 | 156 | ||
172 | #ifdef CONFIG_X86_LOCAL_APIC | 157 | #ifdef CONFIG_X86_LOCAL_APIC |
173 | /* self generated IPI for local APIC timer */ | 158 | /* self generated IPI for local APIC timer */ |
@@ -179,12 +164,41 @@ void __init native_init_IRQ(void) | |||
179 | /* IPI vectors for APIC spurious and error interrupts */ | 164 | /* IPI vectors for APIC spurious and error interrupts */ |
180 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 165 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
181 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 166 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
182 | #endif | 167 | # ifdef CONFIG_PERF_COUNTERS |
168 | alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); | ||
169 | alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); | ||
170 | # endif | ||
183 | 171 | ||
184 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) | 172 | # ifdef CONFIG_X86_MCE_P4THERMAL |
185 | /* thermal monitor LVT interrupt */ | 173 | /* thermal monitor LVT interrupt */ |
186 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 174 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); |
175 | # endif | ||
187 | #endif | 176 | #endif |
177 | } | ||
178 | |||
179 | /* Overridden in paravirt.c */ | ||
180 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | ||
181 | |||
182 | void __init native_init_IRQ(void) | ||
183 | { | ||
184 | int i; | ||
185 | |||
186 | /* Execute any quirks before the call gates are initialised: */ | ||
187 | x86_quirk_pre_intr_init(); | ||
188 | |||
189 | apic_intr_init(); | ||
190 | |||
191 | /* | ||
192 | * Cover the whole vector space, no vector can escape | ||
193 | * us. (some of these will be overridden and become | ||
194 | * 'special' SMP interrupts) | ||
195 | */ | ||
196 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
197 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
198 | /* SYSCALL_VECTOR was reserved in trap_init. */ | ||
199 | if (!test_bit(vector, used_vectors)) | ||
200 | set_intr_gate(vector, interrupt[i]); | ||
201 | } | ||
188 | 202 | ||
189 | if (!acpi_ioapic) | 203 | if (!acpi_ioapic) |
190 | setup_irq(2, &irq2); | 204 | setup_irq(2, &irq2); |
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8cd10537fd46..53ceb26f80ff 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c | |||
@@ -152,6 +152,12 @@ static void __init apic_intr_init(void) | |||
152 | /* IPI vectors for APIC spurious and error interrupts */ | 152 | /* IPI vectors for APIC spurious and error interrupts */ |
153 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 153 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
154 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 154 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
155 | |||
156 | /* Performance monitoring interrupt: */ | ||
157 | #ifdef CONFIG_PERF_COUNTERS | ||
158 | alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); | ||
159 | alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); | ||
160 | #endif | ||
155 | } | 161 | } |
156 | 162 | ||
157 | void __init native_init_IRQ(void) | 163 | void __init native_init_IRQ(void) |
@@ -159,6 +165,9 @@ void __init native_init_IRQ(void) | |||
159 | int i; | 165 | int i; |
160 | 166 | ||
161 | init_ISA_irqs(); | 167 | init_ISA_irqs(); |
168 | |||
169 | apic_intr_init(); | ||
170 | |||
162 | /* | 171 | /* |
163 | * Cover the whole vector space, no vector can escape | 172 | * Cover the whole vector space, no vector can escape |
164 | * us. (some of these will be overridden and become | 173 | * us. (some of these will be overridden and become |
@@ -166,12 +175,10 @@ void __init native_init_IRQ(void) | |||
166 | */ | 175 | */ |
167 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | 176 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { |
168 | int vector = FIRST_EXTERNAL_VECTOR + i; | 177 | int vector = FIRST_EXTERNAL_VECTOR + i; |
169 | if (vector != IA32_SYSCALL_VECTOR) | 178 | if (!test_bit(vector, used_vectors)) |
170 | set_intr_gate(vector, interrupt[i]); | 179 | set_intr_gate(vector, interrupt[i]); |
171 | } | 180 | } |
172 | 181 | ||
173 | apic_intr_init(); | ||
174 | |||
175 | if (!acpi_ioapic) | 182 | if (!acpi_ioapic) |
176 | setup_irq(2, &irq2); | 183 | setup_irq(2, &irq2); |
177 | } | 184 | } |
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..0a813b17b172 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes | 6 | * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes |
7 | * 2000-2002 x86-64 support by Andi Kleen | 7 | * 2000-2002 x86-64 support by Andi Kleen |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
11 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
12 | #include <linux/smp.h> | 11 | #include <linux/smp.h> |
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b491..c3ebbb901379 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -332,5 +332,6 @@ ENTRY(sys_call_table) | |||
332 | .long sys_dup3 /* 330 */ | 332 | .long sys_dup3 /* 330 */ |
333 | .long sys_pipe2 | 333 | .long sys_pipe2 |
334 | .long sys_inotify_init1 | 334 | .long sys_inotify_init1 |
335 | .long sys_perf_counter_open | ||
335 | .long sys_preadv | 336 | .long sys_preadv |
336 | .long sys_pwritev | 337 | .long sys_pwritev |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..2cc162e09c4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -945,8 +945,13 @@ void __init trap_init(void) | |||
945 | #endif | 945 | #endif |
946 | set_intr_gate(19, &simd_coprocessor_error); | 946 | set_intr_gate(19, &simd_coprocessor_error); |
947 | 947 | ||
948 | /* Reserve all the builtin and the syscall vector: */ | ||
949 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | ||
950 | set_bit(i, used_vectors); | ||
951 | |||
948 | #ifdef CONFIG_IA32_EMULATION | 952 | #ifdef CONFIG_IA32_EMULATION |
949 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | 953 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); |
954 | set_bit(IA32_SYSCALL_VECTOR, used_vectors); | ||
950 | #endif | 955 | #endif |
951 | 956 | ||
952 | #ifdef CONFIG_X86_32 | 957 | #ifdef CONFIG_X86_32 |
@@ -963,17 +968,9 @@ void __init trap_init(void) | |||
963 | } | 968 | } |
964 | 969 | ||
965 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); | 970 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); |
966 | #endif | ||
967 | |||
968 | /* Reserve all the builtin and the syscall vector: */ | ||
969 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | ||
970 | set_bit(i, used_vectors); | ||
971 | |||
972 | #ifdef CONFIG_X86_64 | ||
973 | set_bit(IA32_SYSCALL_VECTOR, used_vectors); | ||
974 | #else | ||
975 | set_bit(SYSCALL_VECTOR, used_vectors); | 971 | set_bit(SYSCALL_VECTOR, used_vectors); |
976 | #endif | 972 | #endif |
973 | |||
977 | /* | 974 | /* |
978 | * Should be a barrier for any external CPU state: | 975 | * Should be a barrier for any external CPU state: |
979 | */ | 976 | */ |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa0..f2d3324d9215 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/tty.h> | 27 | #include <linux/tty.h> |
28 | #include <linux/smp.h> | 28 | #include <linux/smp.h> |
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/perf_counter.h> | ||
30 | 31 | ||
31 | #include <asm-generic/sections.h> | 32 | #include <asm-generic/sections.h> |
32 | 33 | ||
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1044 | if (unlikely(error_code & PF_RSVD)) | 1045 | if (unlikely(error_code & PF_RSVD)) |
1045 | pgtable_bad(regs, error_code, address); | 1046 | pgtable_bad(regs, error_code, address); |
1046 | 1047 | ||
1048 | perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); | ||
1049 | |||
1047 | /* | 1050 | /* |
1048 | * If we're in an interrupt, have no user context or are running | 1051 | * If we're in an interrupt, have no user context or are running |
1049 | * in an atomic region then we must not take the fault: | 1052 | * in an atomic region then we must not take the fault: |
@@ -1137,10 +1140,13 @@ good_area: | |||
1137 | return; | 1140 | return; |
1138 | } | 1141 | } |
1139 | 1142 | ||
1140 | if (fault & VM_FAULT_MAJOR) | 1143 | if (fault & VM_FAULT_MAJOR) { |
1141 | tsk->maj_flt++; | 1144 | tsk->maj_flt++; |
1142 | else | 1145 | perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); |
1146 | } else { | ||
1143 | tsk->min_flt++; | 1147 | tsk->min_flt++; |
1148 | perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); | ||
1149 | } | ||
1144 | 1150 | ||
1145 | check_v8086_mode(regs, address, tsk); | 1151 | check_v8086_mode(regs, address, tsk); |
1146 | 1152 | ||
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a7..c638685136e1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self, | |||
40 | 40 | ||
41 | switch (val) { | 41 | switch (val) { |
42 | case DIE_NMI: | 42 | case DIE_NMI: |
43 | if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) | 43 | case DIE_NMI_IPI: |
44 | ret = NOTIFY_STOP; | 44 | model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); |
45 | ret = NOTIFY_STOP; | ||
45 | break; | 46 | break; |
46 | default: | 47 | default: |
47 | break; | 48 | break; |
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy) | |||
134 | static struct notifier_block profile_exceptions_nb = { | 135 | static struct notifier_block profile_exceptions_nb = { |
135 | .notifier_call = profile_exceptions_notify, | 136 | .notifier_call = profile_exceptions_notify, |
136 | .next = NULL, | 137 | .next = NULL, |
137 | .priority = 0 | 138 | .priority = 2 |
138 | }; | 139 | }; |
139 | 140 | ||
140 | static int nmi_setup(void) | 141 | static int nmi_setup(void) |
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 10131fbdaada..4da7230b3d17 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <asm/msr.h> | 18 | #include <asm/msr.h> |
19 | #include <asm/apic.h> | 19 | #include <asm/apic.h> |
20 | #include <asm/nmi.h> | 20 | #include <asm/nmi.h> |
21 | #include <asm/intel_arch_perfmon.h> | 21 | #include <asm/perf_counter.h> |
22 | 22 | ||
23 | #include "op_x86_model.h" | 23 | #include "op_x86_model.h" |
24 | #include "op_counter.h" | 24 | #include "op_counter.h" |
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, | |||
136 | u64 val; | 136 | u64 val; |
137 | int i; | 137 | int i; |
138 | 138 | ||
139 | /* | ||
140 | * This can happen if perf counters are in use when | ||
141 | * we steal the die notifier NMI. | ||
142 | */ | ||
143 | if (unlikely(!reset_value)) | ||
144 | goto out; | ||
145 | |||
139 | for (i = 0 ; i < num_counters; ++i) { | 146 | for (i = 0 ; i < num_counters; ++i) { |
140 | if (!reset_value[i]) | 147 | if (!reset_value[i]) |
141 | continue; | 148 | continue; |
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, | |||
146 | } | 153 | } |
147 | } | 154 | } |
148 | 155 | ||
156 | out: | ||
149 | /* Only P6 based Pentium M need to re-unmask the apic vector but it | 157 | /* Only P6 based Pentium M need to re-unmask the apic vector but it |
150 | * doesn't hurt other P6 variant */ | 158 | * doesn't hurt other P6 variant */ |
151 | apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); | 159 | apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); |
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 4e6e758bd397..429be896a030 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c | |||
@@ -757,8 +757,11 @@ static int acpi_idle_bm_check(void) | |||
757 | */ | 757 | */ |
758 | static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) | 758 | static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) |
759 | { | 759 | { |
760 | u64 perf_flags; | ||
761 | |||
760 | /* Don't trace irqs off for idle */ | 762 | /* Don't trace irqs off for idle */ |
761 | stop_critical_timings(); | 763 | stop_critical_timings(); |
764 | perf_flags = hw_perf_save_disable(); | ||
762 | if (cx->entry_method == ACPI_CSTATE_FFH) { | 765 | if (cx->entry_method == ACPI_CSTATE_FFH) { |
763 | /* Call into architectural FFH based C-state */ | 766 | /* Call into architectural FFH based C-state */ |
764 | acpi_processor_ffh_cstate_enter(cx); | 767 | acpi_processor_ffh_cstate_enter(cx); |
@@ -773,6 +776,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) | |||
773 | gets asserted in time to freeze execution properly. */ | 776 | gets asserted in time to freeze execution properly. */ |
774 | unused = inl(acpi_gbl_FADT.xpm_timer_block.address); | 777 | unused = inl(acpi_gbl_FADT.xpm_timer_block.address); |
775 | } | 778 | } |
779 | hw_perf_restore(perf_flags); | ||
776 | start_critical_timings(); | 780 | start_critical_timings(); |
777 | } | 781 | } |
778 | 782 | ||
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 6de020d078e1..0540d5de2c17 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/kbd_kern.h> | 25 | #include <linux/kbd_kern.h> |
26 | #include <linux/proc_fs.h> | 26 | #include <linux/proc_fs.h> |
27 | #include <linux/quotaops.h> | 27 | #include <linux/quotaops.h> |
28 | #include <linux/perf_counter.h> | ||
28 | #include <linux/kernel.h> | 29 | #include <linux/kernel.h> |
29 | #include <linux/module.h> | 30 | #include <linux/module.h> |
30 | #include <linux/suspend.h> | 31 | #include <linux/suspend.h> |
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) | |||
244 | struct pt_regs *regs = get_irq_regs(); | 245 | struct pt_regs *regs = get_irq_regs(); |
245 | if (regs) | 246 | if (regs) |
246 | show_regs(regs); | 247 | show_regs(regs); |
248 | perf_counter_print_debug(); | ||
247 | } | 249 | } |
248 | static struct sysrq_key_op sysrq_showregs_op = { | 250 | static struct sysrq_key_op sysrq_showregs_op = { |
249 | .handler = sysrq_handle_showregs, | 251 | .handler = sysrq_handle_showregs, |
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/string.h> | 33 | #include <linux/string.h> |
34 | #include <linux/init.h> | 34 | #include <linux/init.h> |
35 | #include <linux/pagemap.h> | 35 | #include <linux/pagemap.h> |
36 | #include <linux/perf_counter.h> | ||
36 | #include <linux/highmem.h> | 37 | #include <linux/highmem.h> |
37 | #include <linux/spinlock.h> | 38 | #include <linux/spinlock.h> |
38 | #include <linux/key.h> | 39 | #include <linux/key.h> |
@@ -1018,6 +1019,13 @@ int flush_old_exec(struct linux_binprm * bprm) | |||
1018 | 1019 | ||
1019 | current->personality &= ~bprm->per_clear; | 1020 | current->personality &= ~bprm->per_clear; |
1020 | 1021 | ||
1022 | /* | ||
1023 | * Flush performance counters when crossing a | ||
1024 | * security domain: | ||
1025 | */ | ||
1026 | if (!get_dumpable(current->mm)) | ||
1027 | perf_counter_exit_task(current); | ||
1028 | |||
1021 | /* An exec changes our domain. We are no longer part of the thread | 1029 | /* An exec changes our domain. We are no longer part of the thread |
1022 | group */ | 1030 | group */ |
1023 | 1031 | ||
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index af1de95e711e..ca226a91abee 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -120,6 +120,18 @@ extern struct group_info init_groups; | |||
120 | 120 | ||
121 | extern struct cred init_cred; | 121 | extern struct cred init_cred; |
122 | 122 | ||
123 | #ifdef CONFIG_PERF_COUNTERS | ||
124 | # define INIT_PERF_COUNTERS(tsk) \ | ||
125 | .perf_counter_ctx.counter_list = \ | ||
126 | LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ | ||
127 | .perf_counter_ctx.event_list = \ | ||
128 | LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ | ||
129 | .perf_counter_ctx.lock = \ | ||
130 | __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), | ||
131 | #else | ||
132 | # define INIT_PERF_COUNTERS(tsk) | ||
133 | #endif | ||
134 | |||
123 | /* | 135 | /* |
124 | * INIT_TASK is used to set up the first task table, touch at | 136 | * INIT_TASK is used to set up the first task table, touch at |
125 | * your own risk!. Base=0, limit=0x1fffff (=2MB) | 137 | * your own risk!. Base=0, limit=0x1fffff (=2MB) |
@@ -185,6 +197,7 @@ extern struct cred init_cred; | |||
185 | INIT_IDS \ | 197 | INIT_IDS \ |
186 | INIT_TRACE_IRQFLAGS \ | 198 | INIT_TRACE_IRQFLAGS \ |
187 | INIT_LOCKDEP \ | 199 | INIT_LOCKDEP \ |
200 | INIT_PERF_COUNTERS(tsk) \ | ||
188 | } | 201 | } |
189 | 202 | ||
190 | 203 | ||
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 0c8b89f28a95..080d1fd461d7 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h | |||
@@ -81,7 +81,13 @@ static inline unsigned int kstat_irqs(unsigned int irq) | |||
81 | return sum; | 81 | return sum; |
82 | } | 82 | } |
83 | 83 | ||
84 | |||
85 | /* | ||
86 | * Lock/unlock the current runqueue - to extract task statistics: | ||
87 | */ | ||
88 | extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); | ||
84 | extern unsigned long long task_delta_exec(struct task_struct *); | 89 | extern unsigned long long task_delta_exec(struct task_struct *); |
90 | |||
85 | extern void account_user_time(struct task_struct *, cputime_t, cputime_t); | 91 | extern void account_user_time(struct task_struct *, cputime_t, cputime_t); |
86 | extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); | 92 | extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); |
87 | extern void account_steal_time(cputime_t); | 93 | extern void account_steal_time(cputime_t); |
diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab8..93054fc3635c 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h | |||
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); | |||
151 | extern int mutex_trylock(struct mutex *lock); | 151 | extern int mutex_trylock(struct mutex *lock); |
152 | extern void mutex_unlock(struct mutex *lock); | 152 | extern void mutex_unlock(struct mutex *lock); |
153 | 153 | ||
154 | /** | ||
155 | * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 | ||
156 | * @cnt: the atomic which we are to dec | ||
157 | * @lock: the mutex to return holding if we dec to 0 | ||
158 | * | ||
159 | * return true and hold lock if we dec to 0, return false otherwise | ||
160 | */ | ||
161 | static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) | ||
162 | { | ||
163 | /* dec if we can't possibly hit 0 */ | ||
164 | if (atomic_add_unless(cnt, -1, 1)) | ||
165 | return 0; | ||
166 | /* we might hit 0, so take the lock */ | ||
167 | mutex_lock(lock); | ||
168 | if (!atomic_dec_and_test(cnt)) { | ||
169 | /* when we actually did the dec, we didn't hit 0 */ | ||
170 | mutex_unlock(lock); | ||
171 | return 0; | ||
172 | } | ||
173 | /* we hit 0, and we hold the lock */ | ||
174 | return 1; | ||
175 | } | ||
176 | |||
154 | #endif | 177 | #endif |
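atomic_dec_and_mutex_lock() implements the common "only the final reference takes the lock and tears down" pattern. A user-space analogue with C11 atomics and pthreads, purely for illustration (none of these names are part of the patch):

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    static atomic_int refcount = 2;
    static pthread_mutex_t teardown_lock = PTHREAD_MUTEX_INITIALIZER;

    /* returns 1 (holding the lock) only for the caller that dropped the
     * count to zero, mirroring atomic_dec_and_mutex_lock() */
    static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
        int old = atomic_load(cnt);

        /* fast path: decrement without the lock if we cannot hit zero */
        while (old > 1) {
            if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                return 0;
        }
        /* slow path: we might hit zero, take the lock first */
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) != 1) {
            pthread_mutex_unlock(lock);
            return 0;
        }
        return 1;       /* count is now zero and we hold the lock */
    }

    static void put_object(void)
    {
        if (dec_and_lock(&refcount, &teardown_lock)) {
            puts("last reference: tearing down under the lock");
            pthread_mutex_unlock(&teardown_lock);
        }
    }

    int main(void)
    {
        put_object();
        put_object();
        return 0;
    }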
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 000000000000..7f5d353d78ac --- /dev/null +++ b/include/linux/perf_counter.h | |||
@@ -0,0 +1,591 @@ | |||
1 | /* | ||
2 | * Performance counters: | ||
3 | * | ||
4 | * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar | ||
6 | * | ||
7 | * Data type definitions, declarations, prototypes. | ||
8 | * | ||
9 | * Started by: Thomas Gleixner and Ingo Molnar | ||
10 | * | ||
11 | * For licencing details see kernel-base/COPYING | ||
12 | */ | ||
13 | #ifndef _LINUX_PERF_COUNTER_H | ||
14 | #define _LINUX_PERF_COUNTER_H | ||
15 | |||
16 | #include <linux/types.h> | ||
17 | #include <linux/ioctl.h> | ||
18 | #include <asm/byteorder.h> | ||
19 | |||
20 | /* | ||
21 | * User-space ABI bits: | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * hw_event.type | ||
26 | */ | ||
27 | enum perf_event_types { | ||
28 | PERF_TYPE_HARDWARE = 0, | ||
29 | PERF_TYPE_SOFTWARE = 1, | ||
30 | PERF_TYPE_TRACEPOINT = 2, | ||
31 | |||
32 | /* | ||
33 | * available TYPE space, raw is the max value. | ||
34 | */ | ||
35 | |||
36 | PERF_TYPE_RAW = 128, | ||
37 | }; | ||
38 | |||
39 | /* | ||
40 | * Generalized performance counter event types, used by the hw_event.event_id | ||
41 | * parameter of the sys_perf_counter_open() syscall: | ||
42 | */ | ||
43 | enum hw_event_ids { | ||
44 | /* | ||
45 | * Common hardware events, generalized by the kernel: | ||
46 | */ | ||
47 | PERF_COUNT_CPU_CYCLES = 0, | ||
48 | PERF_COUNT_INSTRUCTIONS = 1, | ||
49 | PERF_COUNT_CACHE_REFERENCES = 2, | ||
50 | PERF_COUNT_CACHE_MISSES = 3, | ||
51 | PERF_COUNT_BRANCH_INSTRUCTIONS = 4, | ||
52 | PERF_COUNT_BRANCH_MISSES = 5, | ||
53 | PERF_COUNT_BUS_CYCLES = 6, | ||
54 | |||
55 | PERF_HW_EVENTS_MAX = 7, | ||
56 | }; | ||
57 | |||
58 | /* | ||
59 | * Special "software" counters provided by the kernel, even if the hardware | ||
60 | * does not support performance counters. These counters measure various | ||
61 | * physical and sw events of the kernel (and allow them to be | ||
62 | * profiled as well): | ||
63 | */ | ||
64 | enum sw_event_ids { | ||
65 | PERF_COUNT_CPU_CLOCK = 0, | ||
66 | PERF_COUNT_TASK_CLOCK = 1, | ||
67 | PERF_COUNT_PAGE_FAULTS = 2, | ||
68 | PERF_COUNT_CONTEXT_SWITCHES = 3, | ||
69 | PERF_COUNT_CPU_MIGRATIONS = 4, | ||
70 | PERF_COUNT_PAGE_FAULTS_MIN = 5, | ||
71 | PERF_COUNT_PAGE_FAULTS_MAJ = 6, | ||
72 | |||
73 | PERF_SW_EVENTS_MAX = 7, | ||
74 | }; | ||
75 | |||
76 | #define __PERF_COUNTER_MASK(name) \ | ||
77 | (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ | ||
78 | PERF_COUNTER_##name##_SHIFT) | ||
79 | |||
80 | #define PERF_COUNTER_RAW_BITS 1 | ||
81 | #define PERF_COUNTER_RAW_SHIFT 63 | ||
82 | #define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) | ||
83 | |||
84 | #define PERF_COUNTER_CONFIG_BITS 63 | ||
85 | #define PERF_COUNTER_CONFIG_SHIFT 0 | ||
86 | #define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) | ||
87 | |||
88 | #define PERF_COUNTER_TYPE_BITS 7 | ||
89 | #define PERF_COUNTER_TYPE_SHIFT 56 | ||
90 | #define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) | ||
91 | |||
92 | #define PERF_COUNTER_EVENT_BITS 56 | ||
93 | #define PERF_COUNTER_EVENT_SHIFT 0 | ||
94 | #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) | ||
95 | |||
96 | /* | ||
97 | * Bits that can be set in hw_event.record_type to request information | ||
98 | * in the overflow packets. | ||
99 | */ | ||
100 | enum perf_counter_record_format { | ||
101 | PERF_RECORD_IP = 1U << 0, | ||
102 | PERF_RECORD_TID = 1U << 1, | ||
103 | PERF_RECORD_GROUP = 1U << 2, | ||
104 | PERF_RECORD_CALLCHAIN = 1U << 3, | ||
105 | PERF_RECORD_TIME = 1U << 4, | ||
106 | }; | ||
107 | |||
108 | /* | ||
109 | * Bits that can be set in hw_event.read_format to request that | ||
110 | * reads on the counter should return the indicated quantities, | ||
111 | * in increasing order of bit value, after the counter value. | ||
112 | */ | ||
113 | enum perf_counter_read_format { | ||
114 | PERF_FORMAT_TOTAL_TIME_ENABLED = 1, | ||
115 | PERF_FORMAT_TOTAL_TIME_RUNNING = 2, | ||
116 | }; | ||
117 | |||
118 | /* | ||
119 | * Hardware event to monitor via a performance monitoring counter: | ||
120 | */ | ||
121 | struct perf_counter_hw_event { | ||
122 | /* | ||
123 | * The MSB of the config word signifies if the rest contains cpu | ||
124 | * specific (raw) counter configuration data; if unset, the next | ||
125 | * 7 bits are an event type and the rest of the bits are the event | ||
126 | * identifier. | ||
127 | */ | ||
128 | __u64 config; | ||
129 | |||
130 | __u64 irq_period; | ||
131 | __u32 record_type; | ||
132 | __u32 read_format; | ||
133 | |||
134 | __u64 disabled : 1, /* off by default */ | ||
135 | nmi : 1, /* NMI sampling */ | ||
136 | inherit : 1, /* children inherit it */ | ||
137 | pinned : 1, /* must always be on PMU */ | ||
138 | exclusive : 1, /* only group on PMU */ | ||
139 | exclude_user : 1, /* don't count user */ | ||
140 | exclude_kernel : 1, /* ditto kernel */ | ||
141 | exclude_hv : 1, /* ditto hypervisor */ | ||
142 | exclude_idle : 1, /* don't count when idle */ | ||
143 | mmap : 1, /* include mmap data */ | ||
144 | munmap : 1, /* include munmap data */ | ||
145 | |||
146 | __reserved_1 : 53; | ||
147 | |||
148 | __u32 extra_config_len; | ||
149 | __u32 wakeup_events; /* wakeup every n events */ | ||
150 | |||
151 | __u64 __reserved_2; | ||
152 | __u64 __reserved_3; | ||
153 | }; | ||
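To see how the ABI above is meant to be used, a software page-fault counter for the current task could be opened roughly as follows. This is a sketch only: it assumes the exported header is installed and that __NR_perf_counter_open is defined for this kernel (the syscall itself is wired up in syscall_table_32.S earlier in this series):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>   /* ABI definitions added above */

    int main(void)
    {
        struct perf_counter_hw_event hw_event;
        unsigned long long count = 0;
        int fd;

        memset(&hw_event, 0, sizeof(hw_event));
        /* 7-bit type in bits 62-56, event id in the low bits: */
        hw_event.config =
            ((unsigned long long)PERF_TYPE_SOFTWARE << PERF_COUNTER_TYPE_SHIFT) |
            PERF_COUNT_PAGE_FAULTS;

        /* assumed syscall number; pid 0 == current task, cpu -1 == any CPU */
        fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_counter_open");
            return 1;
        }

        /* ... run the workload to be measured ... */

        if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("page faults: %llu\n", count);
        return 0;
    }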
154 | |||
155 | /* | ||
156 | * Ioctls that can be done on a perf counter fd: | ||
157 | */ | ||
158 | #define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) | ||
159 | #define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) | ||
160 | #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) | ||
161 | |||
162 | /* | ||
163 | * Structure of the page that can be mapped via mmap | ||
164 | */ | ||
165 | struct perf_counter_mmap_page { | ||
166 | __u32 version; /* version number of this structure */ | ||
167 | __u32 compat_version; /* lowest version this is compat with */ | ||
168 | |||
169 | /* | ||
170 | * Bits needed to read the hw counters in user-space. | ||
171 | * | ||
172 | * u32 seq; | ||
173 | * s64 count; | ||
174 | * | ||
175 | * do { | ||
176 | * seq = pc->lock; | ||
177 | * | ||
178 | * barrier() | ||
179 | * if (pc->index) { | ||
180 | * count = pmc_read(pc->index - 1); | ||
181 | * count += pc->offset; | ||
182 | * } else | ||
183 | * goto regular_read; | ||
184 | * | ||
185 | * barrier(); | ||
186 | * } while (pc->lock != seq); | ||
187 | * | ||
188 | * NOTE: for obvious reasons this only works on self-monitoring | ||
189 | * processes. | ||
190 | */ | ||
191 | __u32 lock; /* seqlock for synchronization */ | ||
192 | __u32 index; /* hardware counter identifier */ | ||
193 | __s64 offset; /* add to hardware counter value */ | ||
194 | |||
195 | /* | ||
196 | * Control data for the mmap() data buffer. | ||
197 | * | ||
198 | * User-space should issue an rmb() after reading this value, on | ||
199 | * SMP-capable platforms -- see perf_counter_wakeup(). | ||
200 | */ | ||
201 | __u32 data_head; /* head in the data section */ | ||
202 | }; | ||
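The loop documented in the comment above translates into a small self-monitoring read routine. A hedged, x86-only sketch; it assumes the page has been mmap()ed from the counter fd, that rdpmc is permitted from user space, and that the counter runs on the calling CPU:

    #include <stdint.h>
    #include <linux/perf_counter.h>   /* ABI definitions added above */

    #define barrier()   asm volatile("" ::: "memory")

    static inline uint64_t rdpmc(uint32_t idx)
    {
        uint32_t lo, hi;

        asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(idx));
        return (uint64_t)lo | ((uint64_t)hi << 32);
    }

    /*
     * pc points at the mmap()ed counter page; returns 0 and sets *value on
     * success, -1 if the counter is not currently loaded on a hw counter
     * (in which case the caller should fall back to read() on the fd).
     */
    static int read_self_counter(volatile struct perf_counter_mmap_page *pc,
                                 int64_t *value)
    {
        uint32_t seq;
        int64_t count;

        do {
            seq = pc->lock;
            barrier();
            if (!pc->index)
                return -1;
            count = (int64_t)rdpmc(pc->index - 1) + pc->offset;
            barrier();
        } while (pc->lock != seq);

        *value = count;
        return 0;
    }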
203 | |||
204 | struct perf_event_header { | ||
205 | __u32 type; | ||
206 | __u32 size; | ||
207 | }; | ||
208 | |||
209 | enum perf_event_type { | ||
210 | |||
211 | /* | ||
212 | * The MMAP events record the PROT_EXEC mappings so that we can | ||
213 | * correlate userspace IPs to code. They have the following structure: | ||
214 | * | ||
215 | * struct { | ||
216 | * struct perf_event_header header; | ||
217 | * | ||
218 | * u32 pid, tid; | ||
219 | * u64 addr; | ||
220 | * u64 len; | ||
221 | * u64 pgoff; | ||
222 | * char filename[]; | ||
223 | * }; | ||
224 | */ | ||
225 | PERF_EVENT_MMAP = 1, | ||
226 | PERF_EVENT_MUNMAP = 2, | ||
227 | |||
228 | /* | ||
229 | * Half the event type space is reserved for the counter overflow | ||
230 | * bitfields, as found in hw_event.record_type. | ||
231 | * | ||
232 | * These events will have types of the form: | ||
233 | * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * | ||
234 | * | ||
235 | * struct { | ||
236 | * struct perf_event_header header; | ||
237 | * | ||
238 | * { u64 ip; } && __PERF_EVENT_IP | ||
239 | * { u32 pid, tid; } && __PERF_EVENT_TID | ||
240 | * | ||
241 | * { u64 nr; | ||
242 | * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP | ||
243 | * | ||
244 | * { u16 nr, | ||
245 | * hv, | ||
246 | * kernel, | ||
247 | * user; | ||
248 | * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN | ||
249 | * | ||
250 | * { u64 time; } && __PERF_EVENT_TIME | ||
251 | * }; | ||
252 | */ | ||
253 | PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, | ||
254 | __PERF_EVENT_IP = PERF_RECORD_IP, | ||
255 | __PERF_EVENT_TID = PERF_RECORD_TID, | ||
256 | __PERF_EVENT_GROUP = PERF_RECORD_GROUP, | ||
257 | __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, | ||
258 | __PERF_EVENT_TIME = PERF_RECORD_TIME, | ||
259 | }; | ||
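A consumer walks the mmap() data area as a stream of perf_event_header records and dispatches on the type field; a minimal sketch, assuming the records have already been copied out into a contiguous buffer:

    #include <stdio.h>
    #include <stddef.h>
    #include <linux/perf_counter.h>   /* ABI definitions added above */

    static void parse_event_stream(const unsigned char *buf, size_t len)
    {
        size_t off = 0;

        while (off + sizeof(struct perf_event_header) <= len) {
            const struct perf_event_header *hdr =
                (const struct perf_event_header *)(buf + off);

            if (!hdr->size || off + hdr->size > len)
                break;                  /* truncated record */

            if (hdr->type == PERF_EVENT_MMAP ||
                hdr->type == PERF_EVENT_MUNMAP) {
                printf("mmap/munmap record, %u bytes\n", hdr->size);
            } else if (hdr->type & PERF_EVENT_COUNTER_OVERFLOW) {
                /* low bits are the __PERF_EVENT_* field mask */
                printf("overflow sample, fields %#x, %u bytes\n",
                       hdr->type & 0x7fffffffu, hdr->size);
            }
            off += hdr->size;
        }
    }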
260 | |||
261 | #ifdef __KERNEL__ | ||
262 | /* | ||
263 | * Kernel-internal data types and definitions: | ||
264 | */ | ||
265 | |||
266 | #ifdef CONFIG_PERF_COUNTERS | ||
267 | # include <asm/perf_counter.h> | ||
268 | #endif | ||
269 | |||
270 | #include <linux/list.h> | ||
271 | #include <linux/mutex.h> | ||
272 | #include <linux/rculist.h> | ||
273 | #include <linux/rcupdate.h> | ||
274 | #include <linux/spinlock.h> | ||
275 | #include <linux/hrtimer.h> | ||
276 | #include <linux/fs.h> | ||
277 | #include <asm/atomic.h> | ||
278 | |||
279 | struct task_struct; | ||
280 | |||
281 | static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) | ||
282 | { | ||
283 | return hw_event->config & PERF_COUNTER_RAW_MASK; | ||
284 | } | ||
285 | |||
286 | static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) | ||
287 | { | ||
288 | return hw_event->config & PERF_COUNTER_CONFIG_MASK; | ||
289 | } | ||
290 | |||
291 | static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) | ||
292 | { | ||
293 | return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> | ||
294 | PERF_COUNTER_TYPE_SHIFT; | ||
295 | } | ||
296 | |||
297 | static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) | ||
298 | { | ||
299 | return hw_event->config & PERF_COUNTER_EVENT_MASK; | ||
300 | } | ||
301 | |||
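The helpers above simply pick the config word apart along the masks defined earlier; the same decomposition can be demonstrated from user space (a sketch, with 0x00c0 as an arbitrary example raw event code):

    #include <stdio.h>
    #include <linux/perf_counter.h>   /* ABI definitions added above */

    int main(void)
    {
        unsigned long long config;

        /* generalized event: 7-bit type in bits 62-56, event id below it */
        config = ((unsigned long long)PERF_TYPE_SOFTWARE <<
                  PERF_COUNTER_TYPE_SHIFT) | PERF_COUNT_CONTEXT_SWITCHES;
        printf("type=%llu id=%llu\n",
               (config & PERF_COUNTER_TYPE_MASK) >> PERF_COUNTER_TYPE_SHIFT,
               config & PERF_COUNTER_EVENT_MASK);

        /* raw event: MSB set, low 63 bits handed to the PMU driver as-is */
        config = PERF_COUNTER_RAW_MASK | 0x00c0;
        printf("raw=%d config=%#llx\n",
               !!(config & PERF_COUNTER_RAW_MASK),
               config & PERF_COUNTER_CONFIG_MASK);
        return 0;
    }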
302 | /** | ||
303 | * struct hw_perf_counter - performance counter hardware details: | ||
304 | */ | ||
305 | struct hw_perf_counter { | ||
306 | #ifdef CONFIG_PERF_COUNTERS | ||
307 | union { | ||
308 | struct { /* hardware */ | ||
309 | u64 config; | ||
310 | unsigned long config_base; | ||
311 | unsigned long counter_base; | ||
312 | int nmi; | ||
313 | unsigned int idx; | ||
314 | }; | ||
315 | union { /* software */ | ||
316 | atomic64_t count; | ||
317 | struct hrtimer hrtimer; | ||
318 | }; | ||
319 | }; | ||
320 | atomic64_t prev_count; | ||
321 | u64 irq_period; | ||
322 | atomic64_t period_left; | ||
323 | #endif | ||
324 | }; | ||
325 | |||
326 | struct perf_counter; | ||
327 | |||
328 | /** | ||
329 | * struct hw_perf_counter_ops - performance counter hw ops | ||
330 | */ | ||
331 | struct hw_perf_counter_ops { | ||
332 | int (*enable) (struct perf_counter *counter); | ||
333 | void (*disable) (struct perf_counter *counter); | ||
334 | void (*read) (struct perf_counter *counter); | ||
335 | }; | ||
336 | |||
337 | /** | ||
338 | * enum perf_counter_active_state - the states of a counter | ||
339 | */ | ||
340 | enum perf_counter_active_state { | ||
341 | PERF_COUNTER_STATE_ERROR = -2, | ||
342 | PERF_COUNTER_STATE_OFF = -1, | ||
343 | PERF_COUNTER_STATE_INACTIVE = 0, | ||
344 | PERF_COUNTER_STATE_ACTIVE = 1, | ||
345 | }; | ||
346 | |||
347 | struct file; | ||
348 | |||
349 | struct perf_mmap_data { | ||
350 | struct rcu_head rcu_head; | ||
351 | int nr_pages; | ||
352 | atomic_t wakeup; | ||
353 | atomic_t head; | ||
354 | atomic_t events; | ||
355 | struct perf_counter_mmap_page *user_page; | ||
356 | void *data_pages[0]; | ||
357 | }; | ||
358 | |||
359 | struct perf_pending_entry { | ||
360 | struct perf_pending_entry *next; | ||
361 | void (*func)(struct perf_pending_entry *); | ||
362 | }; | ||
363 | |||
364 | /** | ||
365 | * struct perf_counter - performance counter kernel representation: | ||
366 | */ | ||
367 | struct perf_counter { | ||
368 | #ifdef CONFIG_PERF_COUNTERS | ||
369 | struct list_head list_entry; | ||
370 | struct list_head event_entry; | ||
371 | struct list_head sibling_list; | ||
372 | int nr_siblings; | ||
373 | struct perf_counter *group_leader; | ||
374 | const struct hw_perf_counter_ops *hw_ops; | ||
375 | |||
376 | enum perf_counter_active_state state; | ||
377 | enum perf_counter_active_state prev_state; | ||
378 | atomic64_t count; | ||
379 | |||
380 | /* | ||
381 | * These are the total time in nanoseconds that the counter | ||
382 | * has been enabled (i.e. eligible to run, and the task has | ||
383 | * been scheduled in, if this is a per-task counter) | ||
384 | * and running (scheduled onto the CPU), respectively. | ||
385 | * | ||
386 | * They are computed from tstamp_enabled, tstamp_running and | ||
387 | * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. | ||
388 | */ | ||
389 | u64 total_time_enabled; | ||
390 | u64 total_time_running; | ||
391 | |||
392 | /* | ||
393 | * These are timestamps used for computing total_time_enabled | ||
394 | * and total_time_running when the counter is in INACTIVE or | ||
395 | * ACTIVE state, measured in nanoseconds from an arbitrary point | ||
396 | * in time. | ||
397 | * tstamp_enabled: the notional time when the counter was enabled | ||
398 | * tstamp_running: the notional time when the counter was scheduled on | ||
399 | * tstamp_stopped: in INACTIVE state, the notional time when the | ||
400 | * counter was scheduled off. | ||
401 | */ | ||
402 | u64 tstamp_enabled; | ||
403 | u64 tstamp_running; | ||
404 | u64 tstamp_stopped; | ||
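	/*
	 * Illustrative note (matches update_counter_times() in
	 * kernel/perf_counter.c): while the counter is INACTIVE or ACTIVE,
	 *
	 *	total_time_enabled = ctx->time - tstamp_enabled;
	 *	total_time_running = (ACTIVE ? ctx->time : tstamp_stopped)
	 *			     - tstamp_running;
	 */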
405 | |||
406 | struct perf_counter_hw_event hw_event; | ||
407 | struct hw_perf_counter hw; | ||
408 | |||
409 | struct perf_counter_context *ctx; | ||
410 | struct task_struct *task; | ||
411 | struct file *filp; | ||
412 | |||
413 | struct perf_counter *parent; | ||
414 | struct list_head child_list; | ||
415 | |||
416 | /* | ||
417 | * These accumulate the total time (in nanoseconds) that child | ||
418 | * counters have been enabled and running, respectively. | ||
419 | */ | ||
420 | atomic64_t child_total_time_enabled; | ||
421 | atomic64_t child_total_time_running; | ||
422 | |||
423 | /* | ||
424 | * Protect attach/detach and child_list: | ||
425 | */ | ||
426 | struct mutex mutex; | ||
427 | |||
428 | int oncpu; | ||
429 | int cpu; | ||
430 | |||
431 | /* mmap bits */ | ||
432 | struct mutex mmap_mutex; | ||
433 | atomic_t mmap_count; | ||
434 | struct perf_mmap_data *data; | ||
435 | |||
436 | /* poll related */ | ||
437 | wait_queue_head_t waitq; | ||
438 | struct fasync_struct *fasync; | ||
439 | |||
440 | /* delayed work for NMIs and such */ | ||
441 | int pending_wakeup; | ||
442 | int pending_kill; | ||
443 | int pending_disable; | ||
444 | struct perf_pending_entry pending; | ||
445 | |||
446 | atomic_t event_limit; | ||
447 | |||
448 | void (*destroy)(struct perf_counter *); | ||
449 | struct rcu_head rcu_head; | ||
450 | #endif | ||
451 | }; | ||
452 | |||
453 | /** | ||
454 | * struct perf_counter_context - counter context structure | ||
455 | * | ||
456 | * Used as a container for task counters and CPU counters as well: | ||
457 | */ | ||
458 | struct perf_counter_context { | ||
459 | #ifdef CONFIG_PERF_COUNTERS | ||
460 | /* | ||
461 | * Protect the states of the counters in the list, | ||
462 | * nr_active, and the list: | ||
463 | */ | ||
464 | spinlock_t lock; | ||
465 | /* | ||
466 | * Protect the list of counters. Locking either mutex or lock | ||
467 | * is sufficient to ensure the list doesn't change; to change | ||
468 | * the list you need to lock both the mutex and the spinlock. | ||
469 | */ | ||
470 | struct mutex mutex; | ||
471 | |||
472 | struct list_head counter_list; | ||
473 | struct list_head event_list; | ||
474 | int nr_counters; | ||
475 | int nr_active; | ||
476 | int is_active; | ||
477 | struct task_struct *task; | ||
478 | |||
479 | /* | ||
480 | * Context clock, runs when context enabled. | ||
481 | */ | ||
482 | u64 time; | ||
483 | u64 timestamp; | ||
484 | #endif | ||
485 | }; | ||
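/*
 * Illustrative sketch of the locking rule documented above (not part of
 * this patch): either lock is enough to walk counter_list, but modifying
 * it takes both, mutex first (the mutex can sleep, so it must not be
 * taken under the spinlock):
 *
 *	mutex_lock(&ctx->mutex);
 *	spin_lock_irq(&ctx->lock);
 *	... add/remove entries on ctx->counter_list ...
 *	spin_unlock_irq(&ctx->lock);
 *	mutex_unlock(&ctx->mutex);
 */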
486 | |||
487 | /** | ||
488 | * struct perf_cpu_context - per cpu counter context structure | ||
489 | */ | ||
490 | struct perf_cpu_context { | ||
491 | struct perf_counter_context ctx; | ||
492 | struct perf_counter_context *task_ctx; | ||
493 | int active_oncpu; | ||
494 | int max_pertask; | ||
495 | int exclusive; | ||
496 | |||
497 | /* | ||
498 | * Recursion avoidance: | ||
499 | * | ||
500 | * task, softirq, irq, nmi context | ||
501 | */ | ||
502 | int recursion[4]; | ||
503 | }; | ||
504 | |||
505 | /* | ||
506 | * Set by architecture code: | ||
507 | */ | ||
508 | extern int perf_max_counters; | ||
509 | |||
510 | #ifdef CONFIG_PERF_COUNTERS | ||
511 | extern const struct hw_perf_counter_ops * | ||
512 | hw_perf_counter_init(struct perf_counter *counter); | ||
513 | |||
514 | extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); | ||
515 | extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); | ||
516 | extern void perf_counter_task_tick(struct task_struct *task, int cpu); | ||
517 | extern void perf_counter_init_task(struct task_struct *child); | ||
518 | extern void perf_counter_exit_task(struct task_struct *child); | ||
519 | extern void perf_counter_do_pending(void); | ||
520 | extern void perf_counter_print_debug(void); | ||
521 | extern void perf_counter_unthrottle(void); | ||
522 | extern u64 hw_perf_save_disable(void); | ||
523 | extern void hw_perf_restore(u64 ctrl); | ||
524 | extern int perf_counter_task_disable(void); | ||
525 | extern int perf_counter_task_enable(void); | ||
526 | extern int hw_perf_group_sched_in(struct perf_counter *group_leader, | ||
527 | struct perf_cpu_context *cpuctx, | ||
528 | struct perf_counter_context *ctx, int cpu); | ||
529 | extern void perf_counter_update_userpage(struct perf_counter *counter); | ||
530 | |||
531 | extern int perf_counter_overflow(struct perf_counter *counter, | ||
532 | int nmi, struct pt_regs *regs); | ||
533 | /* | ||
534 | * Return 1 for a software counter, 0 for a hardware counter | ||
535 | */ | ||
536 | static inline int is_software_counter(struct perf_counter *counter) | ||
537 | { | ||
538 | return !perf_event_raw(&counter->hw_event) && | ||
539 | perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; | ||
540 | } | ||
541 | |||
542 | extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); | ||
543 | |||
544 | extern void perf_counter_mmap(unsigned long addr, unsigned long len, | ||
545 | unsigned long pgoff, struct file *file); | ||
546 | |||
547 | extern void perf_counter_munmap(unsigned long addr, unsigned long len, | ||
548 | unsigned long pgoff, struct file *file); | ||
549 | |||
550 | #define MAX_STACK_DEPTH 255 | ||
551 | |||
552 | struct perf_callchain_entry { | ||
553 | u16 nr, hv, kernel, user; | ||
554 | u64 ip[MAX_STACK_DEPTH]; | ||
555 | }; | ||
556 | |||
557 | extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); | ||
558 | |||
559 | #else | ||
560 | static inline void | ||
561 | perf_counter_task_sched_in(struct task_struct *task, int cpu) { } | ||
562 | static inline void | ||
563 | perf_counter_task_sched_out(struct task_struct *task, int cpu) { } | ||
564 | static inline void | ||
565 | perf_counter_task_tick(struct task_struct *task, int cpu) { } | ||
566 | static inline void perf_counter_init_task(struct task_struct *child) { } | ||
567 | static inline void perf_counter_exit_task(struct task_struct *child) { } | ||
568 | static inline void perf_counter_do_pending(void) { } | ||
569 | static inline void perf_counter_print_debug(void) { } | ||
570 | static inline void perf_counter_unthrottle(void) { } | ||
571 | static inline void hw_perf_restore(u64 ctrl) { } | ||
572 | static inline u64 hw_perf_save_disable(void) { return 0; } | ||
573 | static inline int perf_counter_task_disable(void) { return -EINVAL; } | ||
574 | static inline int perf_counter_task_enable(void) { return -EINVAL; } | ||
575 | |||
576 | static inline void | ||
577 | perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } | ||
578 | |||
579 | |||
580 | static inline void | ||
581 | perf_counter_mmap(unsigned long addr, unsigned long len, | ||
582 | unsigned long pgoff, struct file *file) { } | ||
583 | |||
584 | static inline void | ||
585 | perf_counter_munmap(unsigned long addr, unsigned long len, | ||
586 | unsigned long pgoff, struct file *file) { } | ||
587 | |||
588 | #endif | ||
589 | |||
590 | #endif /* __KERNEL__ */ | ||
591 | #endif /* _LINUX_PERF_COUNTER_H */ | ||
diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 48d887e3c6e7..b00df4c79c63 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h | |||
@@ -85,4 +85,7 @@ | |||
85 | #define PR_SET_TIMERSLACK 29 | 85 | #define PR_SET_TIMERSLACK 29 |
86 | #define PR_GET_TIMERSLACK 30 | 86 | #define PR_GET_TIMERSLACK 30 |
87 | 87 | ||
88 | #define PR_TASK_PERF_COUNTERS_DISABLE 31 | ||
89 | #define PR_TASK_PERF_COUNTERS_ENABLE 32 | ||
90 | |||
88 | #endif /* _LINUX_PRCTL_H */ | 91 | #endif /* _LINUX_PRCTL_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index b94f3541f67b..7ed41f7c5ace 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -71,6 +71,7 @@ struct sched_param { | |||
71 | #include <linux/path.h> | 71 | #include <linux/path.h> |
72 | #include <linux/compiler.h> | 72 | #include <linux/compiler.h> |
73 | #include <linux/completion.h> | 73 | #include <linux/completion.h> |
74 | #include <linux/perf_counter.h> | ||
74 | #include <linux/pid.h> | 75 | #include <linux/pid.h> |
75 | #include <linux/percpu.h> | 76 | #include <linux/percpu.h> |
76 | #include <linux/topology.h> | 77 | #include <linux/topology.h> |
@@ -137,6 +138,7 @@ extern unsigned long nr_running(void); | |||
137 | extern unsigned long nr_uninterruptible(void); | 138 | extern unsigned long nr_uninterruptible(void); |
138 | extern unsigned long nr_active(void); | 139 | extern unsigned long nr_active(void); |
139 | extern unsigned long nr_iowait(void); | 140 | extern unsigned long nr_iowait(void); |
141 | extern u64 cpu_nr_migrations(int cpu); | ||
140 | 142 | ||
141 | extern unsigned long get_parent_ip(unsigned long addr); | 143 | extern unsigned long get_parent_ip(unsigned long addr); |
142 | 144 | ||
@@ -1048,9 +1050,10 @@ struct sched_entity { | |||
1048 | u64 last_wakeup; | 1050 | u64 last_wakeup; |
1049 | u64 avg_overlap; | 1051 | u64 avg_overlap; |
1050 | 1052 | ||
1053 | u64 nr_migrations; | ||
1054 | |||
1051 | u64 start_runtime; | 1055 | u64 start_runtime; |
1052 | u64 avg_wakeup; | 1056 | u64 avg_wakeup; |
1053 | u64 nr_migrations; | ||
1054 | 1057 | ||
1055 | #ifdef CONFIG_SCHEDSTATS | 1058 | #ifdef CONFIG_SCHEDSTATS |
1056 | u64 wait_start; | 1059 | u64 wait_start; |
@@ -1372,6 +1375,7 @@ struct task_struct { | |||
1372 | struct list_head pi_state_list; | 1375 | struct list_head pi_state_list; |
1373 | struct futex_pi_state *pi_state_cache; | 1376 | struct futex_pi_state *pi_state_cache; |
1374 | #endif | 1377 | #endif |
1378 | struct perf_counter_context perf_counter_ctx; | ||
1375 | #ifdef CONFIG_NUMA | 1379 | #ifdef CONFIG_NUMA |
1376 | struct mempolicy *mempolicy; | 1380 | struct mempolicy *mempolicy; |
1377 | short il_next; | 1381 | short il_next; |
@@ -2380,6 +2384,13 @@ static inline void inc_syscw(struct task_struct *tsk) | |||
2380 | #define TASK_SIZE_OF(tsk) TASK_SIZE | 2384 | #define TASK_SIZE_OF(tsk) TASK_SIZE |
2381 | #endif | 2385 | #endif |
2382 | 2386 | ||
2387 | /* | ||
2388 | * Call the function if the target task is executing on a CPU right now: | ||
2389 | */ | ||
2390 | extern void task_oncpu_function_call(struct task_struct *p, | ||
2391 | void (*func) (void *info), void *info); | ||
2392 | |||
2393 | |||
2383 | #ifdef CONFIG_MM_OWNER | 2394 | #ifdef CONFIG_MM_OWNER |
2384 | extern void mm_update_next_owner(struct mm_struct *mm); | 2395 | extern void mm_update_next_owner(struct mm_struct *mm); |
2385 | extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); | 2396 | extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6470f74074af..471143bf2aae 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -55,6 +55,7 @@ struct compat_timeval; | |||
55 | struct robust_list_head; | 55 | struct robust_list_head; |
56 | struct getcpu_cache; | 56 | struct getcpu_cache; |
57 | struct old_linux_dirent; | 57 | struct old_linux_dirent; |
58 | struct perf_counter_hw_event; | ||
58 | 59 | ||
59 | #include <linux/types.h> | 60 | #include <linux/types.h> |
60 | #include <linux/aio_abi.h> | 61 | #include <linux/aio_abi.h> |
@@ -754,4 +755,8 @@ asmlinkage long sys_pipe(int __user *); | |||
754 | 755 | ||
755 | int kernel_execve(const char *filename, char *const argv[], char *const envp[]); | 756 | int kernel_execve(const char *filename, char *const argv[], char *const envp[]); |
756 | 757 | ||
758 | |||
759 | asmlinkage long sys_perf_counter_open( | ||
760 | const struct perf_counter_hw_event __user *hw_event_uptr, | ||
761 | pid_t pid, int cpu, int group_fd, unsigned long flags); | ||
757 | #endif | 762 | #endif |
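The hunk above declares the new sys_perf_counter_open() entry point. A minimal, hedged user-space sketch of invoking it for the current task on any CPU (assuming __NR_perf_counter_open is wired up for the architecture and that the new <linux/perf_counter.h> header is visible to user space; the hw_event fields would be filled in per that header):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	int main(void)
	{
		struct perf_counter_hw_event hw_event;
		unsigned long long count;
		int fd;

		memset(&hw_event, 0, sizeof(hw_event));
		/* select what to count here (event config, irq_period, ...) */

		/* pid 0: current task; cpu -1: any CPU;
		 * group_fd -1: assumed to start a new counter group; flags 0 */
		fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		/* ... run the workload to be measured ... */

		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("events: %llu\n", count);

		close(fd);
		return 0;
	}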
diff --git a/init/Kconfig b/init/Kconfig index c52d1d48272a..35659ed442e5 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -919,6 +919,41 @@ config AIO | |||
919 | by some high performance threaded applications. Disabling | 919 | by some high performance threaded applications. Disabling |
920 | this option saves about 7k. | 920 | this option saves about 7k. |
921 | 921 | ||
922 | config HAVE_PERF_COUNTERS | ||
923 | bool | ||
924 | |||
925 | menu "Performance Counters" | ||
926 | |||
927 | config PERF_COUNTERS | ||
928 | bool "Kernel Performance Counters" | ||
929 | depends on HAVE_PERF_COUNTERS | ||
930 | default y | ||
931 | select ANON_INODES | ||
932 | help | ||
933 | Enable kernel support for performance counter hardware. | ||
934 | |||
935 | Performance counters are special hardware registers available | ||
936 | on most modern CPUs. These registers count the number of certain | ||
936 | types of hw events, such as instructions executed, cache misses | ||
937 | suffered, or branches mispredicted - without slowing down the | ||
939 | kernel or applications. These registers can also trigger interrupts | ||
940 | when a threshold number of events have passed - and can thus be | ||
941 | used to profile the code that runs on that CPU. | ||
942 | |||
943 | The Linux Performance Counter subsystem provides an abstraction of | ||
944 | these hardware capabilities, available via a system call. It | ||
945 | provides per task and per CPU counters, and it provides event | ||
946 | capabilities on top of those. | ||
947 | |||
948 | Say Y if unsure. | ||
949 | |||
950 | config EVENT_PROFILE | ||
951 | bool "Tracepoint profile sources" | ||
952 | depends on PERF_COUNTERS && EVENT_TRACER | ||
953 | default y | ||
954 | |||
955 | endmenu | ||
956 | |||
922 | config VM_EVENT_COUNTERS | 957 | config VM_EVENT_COUNTERS |
923 | default y | 958 | default y |
924 | bool "Enable VM event counters for /proc/vmstat" if EMBEDDED | 959 | bool "Enable VM event counters for /proc/vmstat" if EMBEDDED |
diff --git a/kernel/Makefile b/kernel/Makefile index bab1dffe37e9..63c697529ca1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -94,6 +94,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ | |||
94 | obj-$(CONFIG_TRACING) += trace/ | 94 | obj-$(CONFIG_TRACING) += trace/ |
95 | obj-$(CONFIG_SMP) += sched_cpupri.o | 95 | obj-$(CONFIG_SMP) += sched_cpupri.o |
96 | obj-$(CONFIG_SLOW_WORK) += slow-work.o | 96 | obj-$(CONFIG_SLOW_WORK) += slow-work.o |
97 | obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o | ||
97 | 98 | ||
98 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 99 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
99 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 100 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/exit.c b/kernel/exit.c index 32cbf2607cb0..fbb5d94c8bbc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -158,6 +158,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
158 | { | 158 | { |
159 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 159 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
160 | 160 | ||
161 | #ifdef CONFIG_PERF_COUNTERS | ||
162 | WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); | ||
163 | #endif | ||
161 | trace_sched_process_free(tsk); | 164 | trace_sched_process_free(tsk); |
162 | put_task_struct(tsk); | 165 | put_task_struct(tsk); |
163 | } | 166 | } |
@@ -979,10 +982,6 @@ NORET_TYPE void do_exit(long code) | |||
979 | tsk->mempolicy = NULL; | 982 | tsk->mempolicy = NULL; |
980 | #endif | 983 | #endif |
981 | #ifdef CONFIG_FUTEX | 984 | #ifdef CONFIG_FUTEX |
982 | /* | ||
983 | * This must happen late, after the PID is not | ||
984 | * hashed anymore: | ||
985 | */ | ||
986 | if (unlikely(!list_empty(&tsk->pi_state_list))) | 985 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
987 | exit_pi_state_list(tsk); | 986 | exit_pi_state_list(tsk); |
988 | if (unlikely(current->pi_state_cache)) | 987 | if (unlikely(current->pi_state_cache)) |
@@ -1249,6 +1248,12 @@ static int wait_task_zombie(struct task_struct *p, int options, | |||
1249 | */ | 1248 | */ |
1250 | read_unlock(&tasklist_lock); | 1249 | read_unlock(&tasklist_lock); |
1251 | 1250 | ||
1251 | /* | ||
1252 | * Flush inherited counters to the parent - before the parent | ||
1253 | * gets woken up by child-exit notifications. | ||
1254 | */ | ||
1255 | perf_counter_exit_task(p); | ||
1256 | |||
1252 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; | 1257 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; |
1253 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1258 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
1254 | ? p->signal->group_exit_code : p->exit_code; | 1259 | ? p->signal->group_exit_code : p->exit_code; |
diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765bc..381d7f9b70fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
975 | goto fork_out; | 975 | goto fork_out; |
976 | 976 | ||
977 | rt_mutex_init_task(p); | 977 | rt_mutex_init_task(p); |
978 | perf_counter_init_task(p); | ||
978 | 979 | ||
979 | #ifdef CONFIG_PROVE_LOCKING | 980 | #ifdef CONFIG_PROVE_LOCKING |
980 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); | 981 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a3..fd95eaa672e6 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count); | |||
89 | * | 89 | * |
90 | * This function is similar to (but not equivalent to) down(). | 90 | * This function is similar to (but not equivalent to) down(). |
91 | */ | 91 | */ |
92 | void inline __sched mutex_lock(struct mutex *lock) | 92 | void __sched mutex_lock(struct mutex *lock) |
93 | { | 93 | { |
94 | might_sleep(); | 94 | might_sleep(); |
95 | /* | 95 | /* |
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 000000000000..863703b3158f --- /dev/null +++ b/kernel/perf_counter.c | |||
@@ -0,0 +1,3150 @@ | |||
1 | /* | ||
2 | * Performance counter core code | ||
3 | * | ||
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | ||
6 | * | ||
7 | * | ||
8 | * For licensing details see kernel-base/COPYING | ||
9 | */ | ||
10 | |||
11 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/cpu.h> | ||
14 | #include <linux/smp.h> | ||
15 | #include <linux/file.h> | ||
16 | #include <linux/poll.h> | ||
17 | #include <linux/sysfs.h> | ||
18 | #include <linux/ptrace.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <linux/vmstat.h> | ||
21 | #include <linux/hardirq.h> | ||
22 | #include <linux/rculist.h> | ||
23 | #include <linux/uaccess.h> | ||
24 | #include <linux/syscalls.h> | ||
25 | #include <linux/anon_inodes.h> | ||
26 | #include <linux/kernel_stat.h> | ||
27 | #include <linux/perf_counter.h> | ||
28 | #include <linux/dcache.h> | ||
29 | |||
30 | #include <asm/irq_regs.h> | ||
31 | |||
32 | /* | ||
33 | * Each CPU has a list of per CPU counters: | ||
34 | */ | ||
35 | DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
36 | |||
37 | int perf_max_counters __read_mostly = 1; | ||
38 | static int perf_reserved_percpu __read_mostly; | ||
39 | static int perf_overcommit __read_mostly = 1; | ||
40 | |||
41 | /* | ||
42 | * Mutex for (sysadmin-configurable) counter reservations: | ||
43 | */ | ||
44 | static DEFINE_MUTEX(perf_resource_mutex); | ||
45 | |||
46 | /* | ||
47 | * Architecture provided APIs - weak aliases: | ||
48 | */ | ||
49 | extern __weak const struct hw_perf_counter_ops * | ||
50 | hw_perf_counter_init(struct perf_counter *counter) | ||
51 | { | ||
52 | return NULL; | ||
53 | } | ||
54 | |||
55 | u64 __weak hw_perf_save_disable(void) { return 0; } | ||
56 | void __weak hw_perf_restore(u64 ctrl) { barrier(); } | ||
57 | void __weak hw_perf_counter_setup(int cpu) { barrier(); } | ||
58 | int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, | ||
59 | struct perf_cpu_context *cpuctx, | ||
60 | struct perf_counter_context *ctx, int cpu) | ||
61 | { | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | void __weak perf_counter_print_debug(void) { } | ||
66 | |||
67 | static void | ||
68 | list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) | ||
69 | { | ||
70 | struct perf_counter *group_leader = counter->group_leader; | ||
71 | |||
72 | /* | ||
73 | * Depending on whether it is a standalone or sibling counter, | ||
74 | * add it straight to the context's counter list, or to the group | ||
75 | * leader's sibling list: | ||
76 | */ | ||
77 | if (counter->group_leader == counter) | ||
78 | list_add_tail(&counter->list_entry, &ctx->counter_list); | ||
79 | else { | ||
80 | list_add_tail(&counter->list_entry, &group_leader->sibling_list); | ||
81 | group_leader->nr_siblings++; | ||
82 | } | ||
83 | |||
84 | list_add_rcu(&counter->event_entry, &ctx->event_list); | ||
85 | } | ||
86 | |||
87 | static void | ||
88 | list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) | ||
89 | { | ||
90 | struct perf_counter *sibling, *tmp; | ||
91 | |||
92 | list_del_init(&counter->list_entry); | ||
93 | list_del_rcu(&counter->event_entry); | ||
94 | |||
95 | if (counter->group_leader != counter) | ||
96 | counter->group_leader->nr_siblings--; | ||
97 | |||
98 | /* | ||
99 | * If this was a group counter with sibling counters then | ||
100 | * upgrade the siblings to singleton counters by adding them | ||
101 | * to the context list directly: | ||
102 | */ | ||
103 | list_for_each_entry_safe(sibling, tmp, | ||
104 | &counter->sibling_list, list_entry) { | ||
105 | |||
106 | list_move_tail(&sibling->list_entry, &ctx->counter_list); | ||
107 | sibling->group_leader = sibling; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | static void | ||
112 | counter_sched_out(struct perf_counter *counter, | ||
113 | struct perf_cpu_context *cpuctx, | ||
114 | struct perf_counter_context *ctx) | ||
115 | { | ||
116 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) | ||
117 | return; | ||
118 | |||
119 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
120 | counter->tstamp_stopped = ctx->time; | ||
121 | counter->hw_ops->disable(counter); | ||
122 | counter->oncpu = -1; | ||
123 | |||
124 | if (!is_software_counter(counter)) | ||
125 | cpuctx->active_oncpu--; | ||
126 | ctx->nr_active--; | ||
127 | if (counter->hw_event.exclusive || !cpuctx->active_oncpu) | ||
128 | cpuctx->exclusive = 0; | ||
129 | } | ||
130 | |||
131 | static void | ||
132 | group_sched_out(struct perf_counter *group_counter, | ||
133 | struct perf_cpu_context *cpuctx, | ||
134 | struct perf_counter_context *ctx) | ||
135 | { | ||
136 | struct perf_counter *counter; | ||
137 | |||
138 | if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) | ||
139 | return; | ||
140 | |||
141 | counter_sched_out(group_counter, cpuctx, ctx); | ||
142 | |||
143 | /* | ||
144 | * Schedule out siblings (if any): | ||
145 | */ | ||
146 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) | ||
147 | counter_sched_out(counter, cpuctx, ctx); | ||
148 | |||
149 | if (group_counter->hw_event.exclusive) | ||
150 | cpuctx->exclusive = 0; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Cross CPU call to remove a performance counter | ||
155 | * | ||
156 | * We disable the counter on the hardware level first. After that we | ||
157 | * remove it from the context list. | ||
158 | */ | ||
159 | static void __perf_counter_remove_from_context(void *info) | ||
160 | { | ||
161 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
162 | struct perf_counter *counter = info; | ||
163 | struct perf_counter_context *ctx = counter->ctx; | ||
164 | unsigned long flags; | ||
165 | u64 perf_flags; | ||
166 | |||
167 | /* | ||
168 | * If this is a task context, we need to check whether it is | ||
169 | * the current task context of this cpu. If not it has been | ||
170 | * scheduled out before the smp call arrived. | ||
171 | */ | ||
172 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
173 | return; | ||
174 | |||
175 | spin_lock_irqsave(&ctx->lock, flags); | ||
176 | |||
177 | counter_sched_out(counter, cpuctx, ctx); | ||
178 | |||
179 | counter->task = NULL; | ||
180 | ctx->nr_counters--; | ||
181 | |||
182 | /* | ||
183 | * Protect the list operation against NMI by disabling the | ||
184 | * counters on a global level. NOP for non NMI based counters. | ||
185 | */ | ||
186 | perf_flags = hw_perf_save_disable(); | ||
187 | list_del_counter(counter, ctx); | ||
188 | hw_perf_restore(perf_flags); | ||
189 | |||
190 | if (!ctx->task) { | ||
191 | /* | ||
192 | * Allow more per task counters with respect to the | ||
193 | * reservation: | ||
194 | */ | ||
195 | cpuctx->max_pertask = | ||
196 | min(perf_max_counters - ctx->nr_counters, | ||
197 | perf_max_counters - perf_reserved_percpu); | ||
198 | } | ||
199 | |||
200 | spin_unlock_irqrestore(&ctx->lock, flags); | ||
201 | } | ||
202 | |||
203 | |||
204 | /* | ||
205 | * Remove the counter from a task's (or a CPU's) list of counters. | ||
206 | * | ||
207 | * Must be called with counter->mutex and ctx->mutex held. | ||
208 | * | ||
209 | * CPU counters are removed with a smp call. For task counters we only | ||
210 | * call when the task is on a CPU. | ||
211 | */ | ||
212 | static void perf_counter_remove_from_context(struct perf_counter *counter) | ||
213 | { | ||
214 | struct perf_counter_context *ctx = counter->ctx; | ||
215 | struct task_struct *task = ctx->task; | ||
216 | |||
217 | if (!task) { | ||
218 | /* | ||
219 | * Per cpu counters are removed via an smp call and | ||
220 | * the removal is always successful. | ||
221 | */ | ||
222 | smp_call_function_single(counter->cpu, | ||
223 | __perf_counter_remove_from_context, | ||
224 | counter, 1); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | retry: | ||
229 | task_oncpu_function_call(task, __perf_counter_remove_from_context, | ||
230 | counter); | ||
231 | |||
232 | spin_lock_irq(&ctx->lock); | ||
233 | /* | ||
234 | * If the context is active we need to retry the smp call. | ||
235 | */ | ||
236 | if (ctx->nr_active && !list_empty(&counter->list_entry)) { | ||
237 | spin_unlock_irq(&ctx->lock); | ||
238 | goto retry; | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * The lock prevents this context from being scheduled in, so | ||
243 | * we can remove the counter safely if the call above did not | ||
244 | * succeed. | ||
245 | */ | ||
246 | if (!list_empty(&counter->list_entry)) { | ||
247 | ctx->nr_counters--; | ||
248 | list_del_counter(counter, ctx); | ||
249 | counter->task = NULL; | ||
250 | } | ||
251 | spin_unlock_irq(&ctx->lock); | ||
252 | } | ||
253 | |||
254 | static inline u64 perf_clock(void) | ||
255 | { | ||
256 | return cpu_clock(smp_processor_id()); | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Update the record of the current time in a context. | ||
261 | */ | ||
262 | static void update_context_time(struct perf_counter_context *ctx) | ||
263 | { | ||
264 | u64 now = perf_clock(); | ||
265 | |||
266 | ctx->time += now - ctx->timestamp; | ||
267 | ctx->timestamp = now; | ||
268 | } | ||
269 | |||
270 | /* | ||
271 | * Update the total_time_enabled and total_time_running fields for a counter. | ||
272 | */ | ||
273 | static void update_counter_times(struct perf_counter *counter) | ||
274 | { | ||
275 | struct perf_counter_context *ctx = counter->ctx; | ||
276 | u64 run_end; | ||
277 | |||
278 | if (counter->state < PERF_COUNTER_STATE_INACTIVE) | ||
279 | return; | ||
280 | |||
281 | counter->total_time_enabled = ctx->time - counter->tstamp_enabled; | ||
282 | |||
283 | if (counter->state == PERF_COUNTER_STATE_INACTIVE) | ||
284 | run_end = counter->tstamp_stopped; | ||
285 | else | ||
286 | run_end = ctx->time; | ||
287 | |||
288 | counter->total_time_running = run_end - counter->tstamp_running; | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * Update total_time_enabled and total_time_running for all counters in a group. | ||
293 | */ | ||
294 | static void update_group_times(struct perf_counter *leader) | ||
295 | { | ||
296 | struct perf_counter *counter; | ||
297 | |||
298 | update_counter_times(leader); | ||
299 | list_for_each_entry(counter, &leader->sibling_list, list_entry) | ||
300 | update_counter_times(counter); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Cross CPU call to disable a performance counter | ||
305 | */ | ||
306 | static void __perf_counter_disable(void *info) | ||
307 | { | ||
308 | struct perf_counter *counter = info; | ||
309 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
310 | struct perf_counter_context *ctx = counter->ctx; | ||
311 | unsigned long flags; | ||
312 | |||
313 | /* | ||
314 | * If this is a per-task counter, we need to check whether this | ||
315 | * counter's task is the current task on this cpu. | ||
316 | */ | ||
317 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
318 | return; | ||
319 | |||
320 | spin_lock_irqsave(&ctx->lock, flags); | ||
321 | |||
322 | update_context_time(ctx); | ||
323 | |||
324 | /* | ||
325 | * If the counter is on, turn it off. | ||
326 | * If it is in error state, leave it in error state. | ||
327 | */ | ||
328 | if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { | ||
329 | update_context_time(ctx); | ||
330 | update_counter_times(counter); | ||
331 | if (counter == counter->group_leader) | ||
332 | group_sched_out(counter, cpuctx, ctx); | ||
333 | else | ||
334 | counter_sched_out(counter, cpuctx, ctx); | ||
335 | counter->state = PERF_COUNTER_STATE_OFF; | ||
336 | } | ||
337 | |||
338 | spin_unlock_irqrestore(&ctx->lock, flags); | ||
339 | } | ||
340 | |||
341 | /* | ||
342 | * Disable a counter. | ||
343 | */ | ||
344 | static void perf_counter_disable(struct perf_counter *counter) | ||
345 | { | ||
346 | struct perf_counter_context *ctx = counter->ctx; | ||
347 | struct task_struct *task = ctx->task; | ||
348 | |||
349 | if (!task) { | ||
350 | /* | ||
351 | * Disable the counter on the cpu that it's on | ||
352 | */ | ||
353 | smp_call_function_single(counter->cpu, __perf_counter_disable, | ||
354 | counter, 1); | ||
355 | return; | ||
356 | } | ||
357 | |||
358 | retry: | ||
359 | task_oncpu_function_call(task, __perf_counter_disable, counter); | ||
360 | |||
361 | spin_lock_irq(&ctx->lock); | ||
362 | /* | ||
363 | * If the counter is still active, we need to retry the cross-call. | ||
364 | */ | ||
365 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { | ||
366 | spin_unlock_irq(&ctx->lock); | ||
367 | goto retry; | ||
368 | } | ||
369 | |||
370 | /* | ||
371 | * Since we have the lock this context can't be scheduled | ||
372 | * in, so we can change the state safely. | ||
373 | */ | ||
374 | if (counter->state == PERF_COUNTER_STATE_INACTIVE) { | ||
375 | update_counter_times(counter); | ||
376 | counter->state = PERF_COUNTER_STATE_OFF; | ||
377 | } | ||
378 | |||
379 | spin_unlock_irq(&ctx->lock); | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * Disable a counter and all its children. | ||
384 | */ | ||
385 | static void perf_counter_disable_family(struct perf_counter *counter) | ||
386 | { | ||
387 | struct perf_counter *child; | ||
388 | |||
389 | perf_counter_disable(counter); | ||
390 | |||
391 | /* | ||
392 | * Lock the mutex to protect the list of children | ||
393 | */ | ||
394 | mutex_lock(&counter->mutex); | ||
395 | list_for_each_entry(child, &counter->child_list, child_list) | ||
396 | perf_counter_disable(child); | ||
397 | mutex_unlock(&counter->mutex); | ||
398 | } | ||
399 | |||
400 | static int | ||
401 | counter_sched_in(struct perf_counter *counter, | ||
402 | struct perf_cpu_context *cpuctx, | ||
403 | struct perf_counter_context *ctx, | ||
404 | int cpu) | ||
405 | { | ||
406 | if (counter->state <= PERF_COUNTER_STATE_OFF) | ||
407 | return 0; | ||
408 | |||
409 | counter->state = PERF_COUNTER_STATE_ACTIVE; | ||
410 | counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ | ||
411 | /* | ||
412 | * The new state must be visible before we turn it on in the hardware: | ||
413 | */ | ||
414 | smp_wmb(); | ||
415 | |||
416 | if (counter->hw_ops->enable(counter)) { | ||
417 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
418 | counter->oncpu = -1; | ||
419 | return -EAGAIN; | ||
420 | } | ||
421 | |||
422 | counter->tstamp_running += ctx->time - counter->tstamp_stopped; | ||
423 | |||
424 | if (!is_software_counter(counter)) | ||
425 | cpuctx->active_oncpu++; | ||
426 | ctx->nr_active++; | ||
427 | |||
428 | if (counter->hw_event.exclusive) | ||
429 | cpuctx->exclusive = 1; | ||
430 | |||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Return 1 for a group consisting entirely of software counters, | ||
436 | * 0 if the group contains any hardware counters. | ||
437 | */ | ||
438 | static int is_software_only_group(struct perf_counter *leader) | ||
439 | { | ||
440 | struct perf_counter *counter; | ||
441 | |||
442 | if (!is_software_counter(leader)) | ||
443 | return 0; | ||
444 | |||
445 | list_for_each_entry(counter, &leader->sibling_list, list_entry) | ||
446 | if (!is_software_counter(counter)) | ||
447 | return 0; | ||
448 | |||
449 | return 1; | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * Work out whether we can put this counter group on the CPU now. | ||
454 | */ | ||
455 | static int group_can_go_on(struct perf_counter *counter, | ||
456 | struct perf_cpu_context *cpuctx, | ||
457 | int can_add_hw) | ||
458 | { | ||
459 | /* | ||
460 | * Groups consisting entirely of software counters can always go on. | ||
461 | */ | ||
462 | if (is_software_only_group(counter)) | ||
463 | return 1; | ||
464 | /* | ||
465 | * If an exclusive group is already on, no other hardware | ||
466 | * counters can go on. | ||
467 | */ | ||
468 | if (cpuctx->exclusive) | ||
469 | return 0; | ||
470 | /* | ||
471 | * If this group is exclusive and there are already | ||
472 | * counters on the CPU, it can't go on. | ||
473 | */ | ||
474 | if (counter->hw_event.exclusive && cpuctx->active_oncpu) | ||
475 | return 0; | ||
476 | /* | ||
477 | * Otherwise, try to add it if all previous groups were able | ||
478 | * to go on. | ||
479 | */ | ||
480 | return can_add_hw; | ||
481 | } | ||
482 | |||
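/*
 * Add a counter to its context's lists and initialise its timestamps:
 */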
483 | static void add_counter_to_ctx(struct perf_counter *counter, | ||
484 | struct perf_counter_context *ctx) | ||
485 | { | ||
486 | list_add_counter(counter, ctx); | ||
487 | ctx->nr_counters++; | ||
488 | counter->prev_state = PERF_COUNTER_STATE_OFF; | ||
489 | counter->tstamp_enabled = ctx->time; | ||
490 | counter->tstamp_running = ctx->time; | ||
491 | counter->tstamp_stopped = ctx->time; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Cross CPU call to install and enable a performance counter | ||
496 | */ | ||
497 | static void __perf_install_in_context(void *info) | ||
498 | { | ||
499 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
500 | struct perf_counter *counter = info; | ||
501 | struct perf_counter_context *ctx = counter->ctx; | ||
502 | struct perf_counter *leader = counter->group_leader; | ||
503 | int cpu = smp_processor_id(); | ||
504 | unsigned long flags; | ||
505 | u64 perf_flags; | ||
506 | int err; | ||
507 | |||
508 | /* | ||
509 | * If this is a task context, we need to check whether it is | ||
510 | * the current task context of this cpu. If not it has been | ||
511 | * scheduled out before the smp call arrived. | ||
512 | */ | ||
513 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
514 | return; | ||
515 | |||
516 | spin_lock_irqsave(&ctx->lock, flags); | ||
517 | update_context_time(ctx); | ||
518 | |||
519 | /* | ||
520 | * Protect the list operation against NMI by disabling the | ||
521 | * counters on a global level. NOP for non NMI based counters. | ||
522 | */ | ||
523 | perf_flags = hw_perf_save_disable(); | ||
524 | |||
525 | add_counter_to_ctx(counter, ctx); | ||
526 | |||
527 | /* | ||
528 | * Don't put the counter on if it is disabled or if | ||
529 | * it is in a group and the group isn't on. | ||
530 | */ | ||
531 | if (counter->state != PERF_COUNTER_STATE_INACTIVE || | ||
532 | (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) | ||
533 | goto unlock; | ||
534 | |||
535 | /* | ||
536 | * An exclusive counter can't go on if there are already active | ||
537 | * hardware counters, and no hardware counter can go on if there | ||
538 | * is already an exclusive counter on. | ||
539 | */ | ||
540 | if (!group_can_go_on(counter, cpuctx, 1)) | ||
541 | err = -EEXIST; | ||
542 | else | ||
543 | err = counter_sched_in(counter, cpuctx, ctx, cpu); | ||
544 | |||
545 | if (err) { | ||
546 | /* | ||
547 | * This counter couldn't go on. If it is in a group | ||
548 | * then we have to pull the whole group off. | ||
549 | * If the counter group is pinned then put it in error state. | ||
550 | */ | ||
551 | if (leader != counter) | ||
552 | group_sched_out(leader, cpuctx, ctx); | ||
553 | if (leader->hw_event.pinned) { | ||
554 | update_group_times(leader); | ||
555 | leader->state = PERF_COUNTER_STATE_ERROR; | ||
556 | } | ||
557 | } | ||
558 | |||
559 | if (!err && !ctx->task && cpuctx->max_pertask) | ||
560 | cpuctx->max_pertask--; | ||
561 | |||
562 | unlock: | ||
563 | hw_perf_restore(perf_flags); | ||
564 | |||
565 | spin_unlock_irqrestore(&ctx->lock, flags); | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * Attach a performance counter to a context | ||
570 | * | ||
571 | * First we add the counter to the list with the hardware enable bit | ||
572 | * in counter->hw_config cleared. | ||
573 | * | ||
574 | * If the counter is attached to a task which is on a CPU we use a smp | ||
575 | * call to enable it in the task context. The task might have been | ||
576 | * scheduled away, but we check this in the smp call again. | ||
577 | * | ||
578 | * Must be called with ctx->mutex held. | ||
579 | */ | ||
580 | static void | ||
581 | perf_install_in_context(struct perf_counter_context *ctx, | ||
582 | struct perf_counter *counter, | ||
583 | int cpu) | ||
584 | { | ||
585 | struct task_struct *task = ctx->task; | ||
586 | |||
587 | if (!task) { | ||
588 | /* | ||
589 | * Per cpu counters are installed via an smp call and | ||
590 | * the install is always successful. | ||
591 | */ | ||
592 | smp_call_function_single(cpu, __perf_install_in_context, | ||
593 | counter, 1); | ||
594 | return; | ||
595 | } | ||
596 | |||
597 | counter->task = task; | ||
598 | retry: | ||
599 | task_oncpu_function_call(task, __perf_install_in_context, | ||
600 | counter); | ||
601 | |||
602 | spin_lock_irq(&ctx->lock); | ||
603 | /* | ||
604 | * If the context is active we need to retry the smp call. | ||
605 | */ | ||
606 | if (ctx->is_active && list_empty(&counter->list_entry)) { | ||
607 | spin_unlock_irq(&ctx->lock); | ||
608 | goto retry; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * The lock prevents this context from being scheduled in, so | ||
613 | * we can add the counter safely if the call above did not | ||
614 | * succeed. | ||
615 | */ | ||
616 | if (list_empty(&counter->list_entry)) | ||
617 | add_counter_to_ctx(counter, ctx); | ||
618 | spin_unlock_irq(&ctx->lock); | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Cross CPU call to enable a performance counter | ||
623 | */ | ||
624 | static void __perf_counter_enable(void *info) | ||
625 | { | ||
626 | struct perf_counter *counter = info; | ||
627 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
628 | struct perf_counter_context *ctx = counter->ctx; | ||
629 | struct perf_counter *leader = counter->group_leader; | ||
630 | unsigned long flags; | ||
631 | int err; | ||
632 | |||
633 | /* | ||
634 | * If this is a per-task counter, need to check whether this | ||
635 | * counter's task is the current task on this cpu. | ||
636 | */ | ||
637 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
638 | return; | ||
639 | |||
640 | spin_lock_irqsave(&ctx->lock, flags); | ||
641 | update_context_time(ctx); | ||
642 | |||
643 | counter->prev_state = counter->state; | ||
644 | if (counter->state >= PERF_COUNTER_STATE_INACTIVE) | ||
645 | goto unlock; | ||
646 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
647 | counter->tstamp_enabled = ctx->time - counter->total_time_enabled; | ||
648 | |||
649 | /* | ||
650 | * If the counter is in a group and isn't the group leader, | ||
651 | * then don't put it on unless the group is on. | ||
652 | */ | ||
653 | if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) | ||
654 | goto unlock; | ||
655 | |||
656 | if (!group_can_go_on(counter, cpuctx, 1)) | ||
657 | err = -EEXIST; | ||
658 | else | ||
659 | err = counter_sched_in(counter, cpuctx, ctx, | ||
660 | smp_processor_id()); | ||
661 | |||
662 | if (err) { | ||
663 | /* | ||
664 | * If this counter can't go on and it's part of a | ||
665 | * group, then the whole group has to come off. | ||
666 | */ | ||
667 | if (leader != counter) | ||
668 | group_sched_out(leader, cpuctx, ctx); | ||
669 | if (leader->hw_event.pinned) { | ||
670 | update_group_times(leader); | ||
671 | leader->state = PERF_COUNTER_STATE_ERROR; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | unlock: | ||
676 | spin_unlock_irqrestore(&ctx->lock, flags); | ||
677 | } | ||
678 | |||
679 | /* | ||
680 | * Enable a counter. | ||
681 | */ | ||
682 | static void perf_counter_enable(struct perf_counter *counter) | ||
683 | { | ||
684 | struct perf_counter_context *ctx = counter->ctx; | ||
685 | struct task_struct *task = ctx->task; | ||
686 | |||
687 | if (!task) { | ||
688 | /* | ||
689 | * Enable the counter on the cpu that it's on | ||
690 | */ | ||
691 | smp_call_function_single(counter->cpu, __perf_counter_enable, | ||
692 | counter, 1); | ||
693 | return; | ||
694 | } | ||
695 | |||
696 | spin_lock_irq(&ctx->lock); | ||
697 | if (counter->state >= PERF_COUNTER_STATE_INACTIVE) | ||
698 | goto out; | ||
699 | |||
700 | /* | ||
701 | * If the counter is in error state, clear that first. | ||
702 | * That way, if we see the counter in error state below, we | ||
703 | * know that it has gone back into error state, as distinct | ||
704 | * from the task having been scheduled away before the | ||
705 | * cross-call arrived. | ||
706 | */ | ||
707 | if (counter->state == PERF_COUNTER_STATE_ERROR) | ||
708 | counter->state = PERF_COUNTER_STATE_OFF; | ||
709 | |||
710 | retry: | ||
711 | spin_unlock_irq(&ctx->lock); | ||
712 | task_oncpu_function_call(task, __perf_counter_enable, counter); | ||
713 | |||
714 | spin_lock_irq(&ctx->lock); | ||
715 | |||
716 | /* | ||
717 | * If the context is active and the counter is still off, | ||
718 | * we need to retry the cross-call. | ||
719 | */ | ||
720 | if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) | ||
721 | goto retry; | ||
722 | |||
723 | /* | ||
724 | * Since we have the lock this context can't be scheduled | ||
725 | * in, so we can change the state safely. | ||
726 | */ | ||
727 | if (counter->state == PERF_COUNTER_STATE_OFF) { | ||
728 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
729 | counter->tstamp_enabled = | ||
730 | ctx->time - counter->total_time_enabled; | ||
731 | } | ||
732 | out: | ||
733 | spin_unlock_irq(&ctx->lock); | ||
734 | } | ||
735 | |||
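/*
 * Raise the counter's event limit by 'refresh' and (re)enable it:
 */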
736 | static void perf_counter_refresh(struct perf_counter *counter, int refresh) | ||
737 | { | ||
738 | atomic_add(refresh, &counter->event_limit); | ||
739 | perf_counter_enable(counter); | ||
740 | } | ||
741 | |||
742 | /* | ||
743 | * Enable a counter and all its children. | ||
744 | */ | ||
745 | static void perf_counter_enable_family(struct perf_counter *counter) | ||
746 | { | ||
747 | struct perf_counter *child; | ||
748 | |||
749 | perf_counter_enable(counter); | ||
750 | |||
751 | /* | ||
752 | * Lock the mutex to protect the list of children | ||
753 | */ | ||
754 | mutex_lock(&counter->mutex); | ||
755 | list_for_each_entry(child, &counter->child_list, child_list) | ||
756 | perf_counter_enable(child); | ||
757 | mutex_unlock(&counter->mutex); | ||
758 | } | ||
759 | |||
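/*
 * Deschedule all counter groups in a context and mark it inactive:
 */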
760 | void __perf_counter_sched_out(struct perf_counter_context *ctx, | ||
761 | struct perf_cpu_context *cpuctx) | ||
762 | { | ||
763 | struct perf_counter *counter; | ||
764 | u64 flags; | ||
765 | |||
766 | spin_lock(&ctx->lock); | ||
767 | ctx->is_active = 0; | ||
768 | if (likely(!ctx->nr_counters)) | ||
769 | goto out; | ||
770 | update_context_time(ctx); | ||
771 | |||
772 | flags = hw_perf_save_disable(); | ||
773 | if (ctx->nr_active) { | ||
774 | list_for_each_entry(counter, &ctx->counter_list, list_entry) | ||
775 | group_sched_out(counter, cpuctx, ctx); | ||
776 | } | ||
777 | hw_perf_restore(flags); | ||
778 | out: | ||
779 | spin_unlock(&ctx->lock); | ||
780 | } | ||
781 | |||
782 | /* | ||
783 | * Called from scheduler to remove the counters of the current task, | ||
784 | * with interrupts disabled. | ||
785 | * | ||
786 | * We stop each counter and update the counter value in counter->count. | ||
787 | * | ||
788 | * This does not protect us against NMI, but disable() | ||
789 | * sets the disabled bit in the control field of counter _before_ | ||
790 | * accessing the counter control register. If a NMI hits, then it will | ||
791 | * not restart the counter. | ||
792 | */ | ||
793 | void perf_counter_task_sched_out(struct task_struct *task, int cpu) | ||
794 | { | ||
795 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
796 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
797 | struct pt_regs *regs; | ||
798 | |||
799 | if (likely(!cpuctx->task_ctx)) | ||
800 | return; | ||
801 | |||
802 | update_context_time(ctx); | ||
803 | |||
804 | regs = task_pt_regs(task); | ||
805 | perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); | ||
806 | __perf_counter_sched_out(ctx, cpuctx); | ||
807 | |||
808 | cpuctx->task_ctx = NULL; | ||
809 | } | ||
810 | |||
811 | static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) | ||
812 | { | ||
813 | __perf_counter_sched_out(&cpuctx->ctx, cpuctx); | ||
814 | } | ||
815 | |||
816 | static int | ||
817 | group_sched_in(struct perf_counter *group_counter, | ||
818 | struct perf_cpu_context *cpuctx, | ||
819 | struct perf_counter_context *ctx, | ||
820 | int cpu) | ||
821 | { | ||
822 | struct perf_counter *counter, *partial_group; | ||
823 | int ret; | ||
824 | |||
825 | if (group_counter->state == PERF_COUNTER_STATE_OFF) | ||
826 | return 0; | ||
827 | |||
828 | ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); | ||
829 | if (ret) | ||
830 | return ret < 0 ? ret : 0; | ||
831 | |||
832 | group_counter->prev_state = group_counter->state; | ||
833 | if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) | ||
834 | return -EAGAIN; | ||
835 | |||
836 | /* | ||
837 | * Schedule in siblings as one group (if any): | ||
838 | */ | ||
839 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { | ||
840 | counter->prev_state = counter->state; | ||
841 | if (counter_sched_in(counter, cpuctx, ctx, cpu)) { | ||
842 | partial_group = counter; | ||
843 | goto group_error; | ||
844 | } | ||
845 | } | ||
846 | |||
847 | return 0; | ||
848 | |||
849 | group_error: | ||
850 | /* | ||
851 | * Groups can be scheduled in as one unit only, so undo any | ||
852 | * partial group before returning: | ||
853 | */ | ||
854 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { | ||
855 | if (counter == partial_group) | ||
856 | break; | ||
857 | counter_sched_out(counter, cpuctx, ctx); | ||
858 | } | ||
859 | counter_sched_out(group_counter, cpuctx, ctx); | ||
860 | |||
861 | return -EAGAIN; | ||
862 | } | ||
863 | |||
864 | static void | ||
865 | __perf_counter_sched_in(struct perf_counter_context *ctx, | ||
866 | struct perf_cpu_context *cpuctx, int cpu) | ||
867 | { | ||
868 | struct perf_counter *counter; | ||
869 | u64 flags; | ||
870 | int can_add_hw = 1; | ||
871 | |||
872 | spin_lock(&ctx->lock); | ||
873 | ctx->is_active = 1; | ||
874 | if (likely(!ctx->nr_counters)) | ||
875 | goto out; | ||
876 | |||
877 | ctx->timestamp = perf_clock(); | ||
878 | |||
879 | flags = hw_perf_save_disable(); | ||
880 | |||
881 | /* | ||
882 | * First go through the list and put on any pinned groups | ||
883 | * in order to give them the best chance of going on. | ||
884 | */ | ||
885 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
886 | if (counter->state <= PERF_COUNTER_STATE_OFF || | ||
887 | !counter->hw_event.pinned) | ||
888 | continue; | ||
889 | if (counter->cpu != -1 && counter->cpu != cpu) | ||
890 | continue; | ||
891 | |||
892 | if (group_can_go_on(counter, cpuctx, 1)) | ||
893 | group_sched_in(counter, cpuctx, ctx, cpu); | ||
894 | |||
895 | /* | ||
896 | * If this pinned group hasn't been scheduled, | ||
897 | * put it in error state. | ||
898 | */ | ||
899 | if (counter->state == PERF_COUNTER_STATE_INACTIVE) { | ||
900 | update_group_times(counter); | ||
901 | counter->state = PERF_COUNTER_STATE_ERROR; | ||
902 | } | ||
903 | } | ||
904 | |||
905 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
906 | /* | ||
907 | * Ignore counters in OFF or ERROR state, and | ||
908 | * ignore pinned counters since we did them already. | ||
909 | */ | ||
910 | if (counter->state <= PERF_COUNTER_STATE_OFF || | ||
911 | counter->hw_event.pinned) | ||
912 | continue; | ||
913 | |||
914 | /* | ||
915 | * Listen to the 'cpu' scheduling filter constraint | ||
916 | * of counters: | ||
917 | */ | ||
918 | if (counter->cpu != -1 && counter->cpu != cpu) | ||
919 | continue; | ||
920 | |||
921 | if (group_can_go_on(counter, cpuctx, can_add_hw)) { | ||
922 | if (group_sched_in(counter, cpuctx, ctx, cpu)) | ||
923 | can_add_hw = 0; | ||
924 | } | ||
925 | } | ||
926 | hw_perf_restore(flags); | ||
927 | out: | ||
928 | spin_unlock(&ctx->lock); | ||
929 | } | ||
930 | |||
931 | /* | ||
932 | * Called from scheduler to add the counters of the current task | ||
933 | * with interrupts disabled. | ||
934 | * | ||
935 | * We restore the counter value and then enable it. | ||
936 | * | ||
937 | * This does not protect us against NMI, but enable() | ||
938 | * sets the enabled bit in the control field of counter _before_ | ||
939 | * accessing the counter control register. If a NMI hits, then it will | ||
940 | * keep the counter running. | ||
941 | */ | ||
942 | void perf_counter_task_sched_in(struct task_struct *task, int cpu) | ||
943 | { | ||
944 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
945 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
946 | |||
947 | __perf_counter_sched_in(ctx, cpuctx, cpu); | ||
948 | cpuctx->task_ctx = ctx; | ||
949 | } | ||
950 | |||
951 | static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) | ||
952 | { | ||
953 | struct perf_counter_context *ctx = &cpuctx->ctx; | ||
954 | |||
955 | __perf_counter_sched_in(ctx, cpuctx, cpu); | ||
956 | } | ||
957 | |||
958 | int perf_counter_task_disable(void) | ||
959 | { | ||
960 | struct task_struct *curr = current; | ||
961 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
962 | struct perf_counter *counter; | ||
963 | unsigned long flags; | ||
964 | u64 perf_flags; | ||
965 | int cpu; | ||
966 | |||
967 | if (likely(!ctx->nr_counters)) | ||
968 | return 0; | ||
969 | |||
970 | local_irq_save(flags); | ||
971 | cpu = smp_processor_id(); | ||
972 | |||
973 | perf_counter_task_sched_out(curr, cpu); | ||
974 | |||
975 | spin_lock(&ctx->lock); | ||
976 | |||
977 | /* | ||
978 | * Disable all the counters: | ||
979 | */ | ||
980 | perf_flags = hw_perf_save_disable(); | ||
981 | |||
982 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
983 | if (counter->state != PERF_COUNTER_STATE_ERROR) { | ||
984 | update_group_times(counter); | ||
985 | counter->state = PERF_COUNTER_STATE_OFF; | ||
986 | } | ||
987 | } | ||
988 | |||
989 | hw_perf_restore(perf_flags); | ||
990 | |||
991 | spin_unlock_irqrestore(&ctx->lock, flags); | ||
992 | |||
993 | return 0; | ||
994 | } | ||
995 | |||
996 | int perf_counter_task_enable(void) | ||
997 | { | ||
998 | struct task_struct *curr = current; | ||
999 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
1000 | struct perf_counter *counter; | ||
1001 | unsigned long flags; | ||
1002 | u64 perf_flags; | ||
1003 | int cpu; | ||
1004 | |||
1005 | if (likely(!ctx->nr_counters)) | ||
1006 | return 0; | ||
1007 | |||
1008 | local_irq_save(flags); | ||
1009 | cpu = smp_processor_id(); | ||
1010 | |||
1011 | perf_counter_task_sched_out(curr, cpu); | ||
1012 | |||
1013 | spin_lock(&ctx->lock); | ||
1014 | |||
1015 | /* | ||
1016 | * Enable all the counters: | ||
1017 | */ | ||
1018 | perf_flags = hw_perf_save_disable(); | ||
1019 | |||
1020 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
1021 | if (counter->state > PERF_COUNTER_STATE_OFF) | ||
1022 | continue; | ||
1023 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
1024 | counter->tstamp_enabled = | ||
1025 | ctx->time - counter->total_time_enabled; | ||
1026 | counter->hw_event.disabled = 0; | ||
1027 | } | ||
1028 | hw_perf_restore(perf_flags); | ||
1029 | |||
1030 | spin_unlock(&ctx->lock); | ||
1031 | |||
1032 | perf_counter_task_sched_in(curr, cpu); | ||
1033 | |||
1034 | local_irq_restore(flags); | ||
1035 | |||
1036 | return 0; | ||
1037 | } | ||
1038 | |||
1039 | /* | ||
1040 | * Round-robin a context's counters: | ||
1041 | */ | ||
1042 | static void rotate_ctx(struct perf_counter_context *ctx) | ||
1043 | { | ||
1044 | struct perf_counter *counter; | ||
1045 | u64 perf_flags; | ||
1046 | |||
1047 | if (!ctx->nr_counters) | ||
1048 | return; | ||
1049 | |||
1050 | spin_lock(&ctx->lock); | ||
1051 | /* | ||
1052 | * Rotate the first entry last (works just fine for group counters too): | ||
1053 | */ | ||
1054 | perf_flags = hw_perf_save_disable(); | ||
1055 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
1056 | list_move_tail(&counter->list_entry, &ctx->counter_list); | ||
1057 | break; | ||
1058 | } | ||
1059 | hw_perf_restore(perf_flags); | ||
1060 | |||
1061 | spin_unlock(&ctx->lock); | ||
1062 | } | ||
1063 | |||
1064 | void perf_counter_task_tick(struct task_struct *curr, int cpu) | ||
1065 | { | ||
1066 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
1067 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
1068 | const int rotate_percpu = 0; | ||
1069 | |||
1070 | if (rotate_percpu) | ||
1071 | perf_counter_cpu_sched_out(cpuctx); | ||
1072 | perf_counter_task_sched_out(curr, cpu); | ||
1073 | |||
1074 | if (rotate_percpu) | ||
1075 | rotate_ctx(&cpuctx->ctx); | ||
1076 | rotate_ctx(ctx); | ||
1077 | |||
1078 | if (rotate_percpu) | ||
1079 | perf_counter_cpu_sched_in(cpuctx, cpu); | ||
1080 | perf_counter_task_sched_in(curr, cpu); | ||
1081 | } | ||
1082 | |||
1083 | /* | ||
1084 | * Cross CPU call to read the hardware counter | ||
1085 | */ | ||
1086 | static void __read(void *info) | ||
1087 | { | ||
1088 | struct perf_counter *counter = info; | ||
1089 | struct perf_counter_context *ctx = counter->ctx; | ||
1090 | unsigned long flags; | ||
1091 | |||
1092 | local_irq_save(flags); | ||
1093 | if (ctx->is_active) | ||
1094 | update_context_time(ctx); | ||
1095 | counter->hw_ops->read(counter); | ||
1096 | update_counter_times(counter); | ||
1097 | local_irq_restore(flags); | ||
1098 | } | ||
1099 | |||
1100 | static u64 perf_counter_read(struct perf_counter *counter) | ||
1101 | { | ||
1102 | /* | ||
1103 | * If counter is enabled and currently active on a CPU, update the | ||
1104 | * value in the counter structure: | ||
1105 | */ | ||
1106 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { | ||
1107 | smp_call_function_single(counter->oncpu, | ||
1108 | __read, counter, 1); | ||
1109 | } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { | ||
1110 | update_counter_times(counter); | ||
1111 | } | ||
1112 | |||
1113 | return atomic64_read(&counter->count); | ||
1114 | } | ||
1115 | |||
1116 | static void put_context(struct perf_counter_context *ctx) | ||
1117 | { | ||
1118 | if (ctx->task) | ||
1119 | put_task_struct(ctx->task); | ||
1120 | } | ||
1121 | |||
1122 | static struct perf_counter_context *find_get_context(pid_t pid, int cpu) | ||
1123 | { | ||
1124 | struct perf_cpu_context *cpuctx; | ||
1125 | struct perf_counter_context *ctx; | ||
1126 | struct task_struct *task; | ||
1127 | |||
1128 | /* | ||
1129 | * If cpu is not a wildcard then this is a percpu counter: | ||
1130 | */ | ||
1131 | if (cpu != -1) { | ||
1132 | /* Must be root to operate on a CPU counter: */ | ||
1133 | if (!capable(CAP_SYS_ADMIN)) | ||
1134 | return ERR_PTR(-EACCES); | ||
1135 | |||
1136 | if (cpu < 0 || cpu > num_possible_cpus()) | ||
1137 | return ERR_PTR(-EINVAL); | ||
1138 | |||
1139 | /* | ||
1140 | * We could be clever and allow to attach a counter to an | ||
1141 | * offline CPU and activate it when the CPU comes up, but | ||
1142 | * that's for later. | ||
1143 | */ | ||
1144 | if (!cpu_isset(cpu, cpu_online_map)) | ||
1145 | return ERR_PTR(-ENODEV); | ||
1146 | |||
1147 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
1148 | ctx = &cpuctx->ctx; | ||
1149 | |||
1150 | return ctx; | ||
1151 | } | ||
1152 | |||
1153 | rcu_read_lock(); | ||
1154 | if (!pid) | ||
1155 | task = current; | ||
1156 | else | ||
1157 | task = find_task_by_vpid(pid); | ||
1158 | if (task) | ||
1159 | get_task_struct(task); | ||
1160 | rcu_read_unlock(); | ||
1161 | |||
1162 | if (!task) | ||
1163 | return ERR_PTR(-ESRCH); | ||
1164 | |||
1165 | ctx = &task->perf_counter_ctx; | ||
1166 | ctx->task = task; | ||
1167 | |||
1168 | /* Reuse ptrace permission checks for now. */ | ||
1169 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) { | ||
1170 | put_context(ctx); | ||
1171 | return ERR_PTR(-EACCES); | ||
1172 | } | ||
1173 | |||
1174 | return ctx; | ||
1175 | } | ||
1176 | |||
1177 | static void free_counter_rcu(struct rcu_head *head) | ||
1178 | { | ||
1179 | struct perf_counter *counter; | ||
1180 | |||
1181 | counter = container_of(head, struct perf_counter, rcu_head); | ||
1182 | kfree(counter); | ||
1183 | } | ||
1184 | |||
1185 | static void perf_pending_sync(struct perf_counter *counter); | ||
1186 | |||
1187 | static void free_counter(struct perf_counter *counter) | ||
1188 | { | ||
1189 | perf_pending_sync(counter); | ||
1190 | |||
1191 | if (counter->destroy) | ||
1192 | counter->destroy(counter); | ||
1193 | |||
1194 | call_rcu(&counter->rcu_head, free_counter_rcu); | ||
1195 | } | ||
1196 | |||
1197 | /* | ||
1198 | * Called when the last reference to the file is gone. | ||
1199 | */ | ||
1200 | static int perf_release(struct inode *inode, struct file *file) | ||
1201 | { | ||
1202 | struct perf_counter *counter = file->private_data; | ||
1203 | struct perf_counter_context *ctx = counter->ctx; | ||
1204 | |||
1205 | file->private_data = NULL; | ||
1206 | |||
1207 | mutex_lock(&ctx->mutex); | ||
1208 | mutex_lock(&counter->mutex); | ||
1209 | |||
1210 | perf_counter_remove_from_context(counter); | ||
1211 | |||
1212 | mutex_unlock(&counter->mutex); | ||
1213 | mutex_unlock(&ctx->mutex); | ||
1214 | |||
1215 | free_counter(counter); | ||
1216 | put_context(ctx); | ||
1217 | |||
1218 | return 0; | ||
1219 | } | ||
1220 | |||
1221 | /* | ||
1222 | * Read the performance counter - simple non-blocking version for now | ||
1223 | */ | ||
1224 | static ssize_t | ||
1225 | perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | ||
1226 | { | ||
1227 | u64 values[3]; | ||
1228 | int n; | ||
1229 | |||
1230 | /* | ||
1231 | * Return end-of-file for a read on a counter that is in | ||
1232 | * error state (i.e. because it was pinned but it couldn't be | ||
1233 | * scheduled on to the CPU at some point). | ||
1234 | */ | ||
1235 | if (counter->state == PERF_COUNTER_STATE_ERROR) | ||
1236 | return 0; | ||
1237 | |||
1238 | mutex_lock(&counter->mutex); | ||
1239 | values[0] = perf_counter_read(counter); | ||
1240 | n = 1; | ||
1241 | if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
1242 | values[n++] = counter->total_time_enabled + | ||
1243 | atomic64_read(&counter->child_total_time_enabled); | ||
1244 | if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
1245 | values[n++] = counter->total_time_running + | ||
1246 | atomic64_read(&counter->child_total_time_running); | ||
1247 | mutex_unlock(&counter->mutex); | ||
1248 | |||
1249 | if (count < n * sizeof(u64)) | ||
1250 | return -EINVAL; | ||
1251 | count = n * sizeof(u64); | ||
1252 | |||
1253 | if (copy_to_user(buf, values, count)) | ||
1254 | return -EFAULT; | ||
1255 | |||
1256 | return count; | ||
1257 | } | ||
1258 | |||
1259 | static ssize_t | ||
1260 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
1261 | { | ||
1262 | struct perf_counter *counter = file->private_data; | ||
1263 | |||
1264 | return perf_read_hw(counter, buf, count); | ||
1265 | } | ||
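A user-space consumer reads the counter with an ordinary read() on the fd returned by the new syscall; how many u64 values come back depends on which PERF_FORMAT_* bits were set in read_format at open time, exactly as perf_read_hw() lays them out above. A minimal sketch of that consumer side (the fd and the format flags are assumed to come from the patched linux/perf_counter.h):

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    /*
     * Illustrative only: read back the counter value plus the optional
     * time-enabled/time-running fields, mirroring perf_read_hw() above.
     * 'fd' is assumed to have been opened with both
     * PERF_FORMAT_TOTAL_TIME_* bits set in read_format.
     */
    static int read_counter(int fd)
    {
            uint64_t values[3];
            ssize_t n = read(fd, values, sizeof(values));

            if (n < (ssize_t)sizeof(uint64_t))
                    return -1;              /* 0 == EOF: counter in error state */

            printf("count:        %llu\n", (unsigned long long)values[0]);
            if (n >= 2 * (ssize_t)sizeof(uint64_t))
                    printf("time enabled: %llu\n", (unsigned long long)values[1]);
            if (n >= 3 * (ssize_t)sizeof(uint64_t))
                    printf("time running: %llu\n", (unsigned long long)values[2]);
            return 0;
    }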
1266 | |||
1267 | static unsigned int perf_poll(struct file *file, poll_table *wait) | ||
1268 | { | ||
1269 | struct perf_counter *counter = file->private_data; | ||
1270 | struct perf_mmap_data *data; | ||
1271 | unsigned int events; | ||
1272 | |||
1273 | rcu_read_lock(); | ||
1274 | data = rcu_dereference(counter->data); | ||
1275 | if (data) | ||
1276 | events = atomic_xchg(&data->wakeup, 0); | ||
1277 | else | ||
1278 | events = POLL_HUP; | ||
1279 | rcu_read_unlock(); | ||
1280 | |||
1281 | poll_wait(file, &counter->waitq, wait); | ||
1282 | |||
1283 | return events; | ||
1284 | } | ||
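Since perf_poll() reports the pending wakeup state, a consumer that does not want to busy-read can simply block in poll() until the counter signals data. A hedged user-space sketch:

    #include <poll.h>

    /* Block until the counter has data to report (illustrative only). */
    static int wait_for_counter(int fd, int timeout_ms)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            return poll(&pfd, 1, timeout_ms);  /* > 0: check pfd.revents */
    }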
1285 | |||
1286 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
1287 | { | ||
1288 | struct perf_counter *counter = file->private_data; | ||
1289 | int err = 0; | ||
1290 | |||
1291 | switch (cmd) { | ||
1292 | case PERF_COUNTER_IOC_ENABLE: | ||
1293 | perf_counter_enable_family(counter); | ||
1294 | break; | ||
1295 | case PERF_COUNTER_IOC_DISABLE: | ||
1296 | perf_counter_disable_family(counter); | ||
1297 | break; | ||
1298 | case PERF_COUNTER_IOC_REFRESH: | ||
1299 | perf_counter_refresh(counter, arg); | ||
1300 | break; | ||
1301 | default: | ||
1302 | err = -ENOTTY; | ||
1303 | } | ||
1304 | return err; | ||
1305 | } | ||
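The three ioctls give user space runtime control over a counter and, through the *_family variants, over its inherited children as well. Assuming the PERF_COUNTER_IOC_* numbers from the patched linux/perf_counter.h, usage is a plain ioctl(); note that the REFRESH argument is interpreted by perf_counter_refresh(), which is not part of this hunk:

    #include <sys/ioctl.h>
    #include <linux/perf_counter.h>     /* assumed: the header added by this patch */

    /* Illustrative only: pause a counter, later re-arm it. */
    static void pause_counter(int fd)
    {
            ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
    }

    static void rearm_counter(int fd, unsigned long nr_events)
    {
            ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
            ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr_events);
    }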
1306 | |||
1307 | /* | ||
1308 | * Callers need to ensure there can be no nesting of this function, otherwise | ||
1309 | * the seqlock logic goes bad. We cannot serialize this because the arch | ||
1310 | * code calls this from NMI context. | ||
1311 | */ | ||
1312 | void perf_counter_update_userpage(struct perf_counter *counter) | ||
1313 | { | ||
1314 | struct perf_mmap_data *data; | ||
1315 | struct perf_counter_mmap_page *userpg; | ||
1316 | |||
1317 | rcu_read_lock(); | ||
1318 | data = rcu_dereference(counter->data); | ||
1319 | if (!data) | ||
1320 | goto unlock; | ||
1321 | |||
1322 | userpg = data->user_page; | ||
1323 | |||
1324 | /* | ||
1325 | * Disable preemption so as to not let the corresponding user-space | ||
1326 | * spin too long if we get preempted. | ||
1327 | */ | ||
1328 | preempt_disable(); | ||
1329 | ++userpg->lock; | ||
1330 | barrier(); | ||
1331 | userpg->index = counter->hw.idx; | ||
1332 | userpg->offset = atomic64_read(&counter->count); | ||
1333 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) | ||
1334 | userpg->offset -= atomic64_read(&counter->hw.prev_count); | ||
1335 | |||
1336 | barrier(); | ||
1337 | ++userpg->lock; | ||
1338 | preempt_enable(); | ||
1339 | unlock: | ||
1340 | rcu_read_unlock(); | ||
1341 | } | ||
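The two increments of userpg->lock bracket the update like a seqcount: the value is odd while an update is in flight and differs between two reads if one completed in between. A user-space reader of the mmap()ed page would therefore retry along the following lines; the struct below is only an illustrative mirror of the fields touched above, not the authoritative struct perf_counter_mmap_page layout:

    #include <stdint.h>

    /* Illustrative mirror of the fields written above; not the real layout. */
    struct counter_page {
            volatile uint32_t lock;     /* seqcount: odd while an update runs */
            uint32_t index;             /* hw counter index (counter->hw.idx) */
            int64_t offset;             /* to add to the hw counter value */
    };

    static int64_t read_offset(struct counter_page *pg, uint32_t *index)
    {
            uint32_t seq;
            int64_t offset;

            do {
                    seq = pg->lock;
                    __sync_synchronize();       /* pairs with barrier() above */
                    *index = pg->index;
                    offset = pg->offset;
                    __sync_synchronize();
            } while (pg->lock != seq || (seq & 1));

            return offset;
    }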
1342 | |||
1343 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
1344 | { | ||
1345 | struct perf_counter *counter = vma->vm_file->private_data; | ||
1346 | struct perf_mmap_data *data; | ||
1347 | int ret = VM_FAULT_SIGBUS; | ||
1348 | |||
1349 | rcu_read_lock(); | ||
1350 | data = rcu_dereference(counter->data); | ||
1351 | if (!data) | ||
1352 | goto unlock; | ||
1353 | |||
1354 | if (vmf->pgoff == 0) { | ||
1355 | vmf->page = virt_to_page(data->user_page); | ||
1356 | } else { | ||
1357 | int nr = vmf->pgoff - 1; | ||
1358 | |||
1359 | if ((unsigned)nr > data->nr_pages) | ||
1360 | goto unlock; | ||
1361 | |||
1362 | vmf->page = virt_to_page(data->data_pages[nr]); | ||
1363 | } | ||
1364 | get_page(vmf->page); | ||
1365 | ret = 0; | ||
1366 | unlock: | ||
1367 | rcu_read_unlock(); | ||
1368 | |||
1369 | return ret; | ||
1370 | } | ||
1371 | |||
1372 | static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) | ||
1373 | { | ||
1374 | struct perf_mmap_data *data; | ||
1375 | unsigned long size; | ||
1376 | int i; | ||
1377 | |||
1378 | WARN_ON(atomic_read(&counter->mmap_count)); | ||
1379 | |||
1380 | size = sizeof(struct perf_mmap_data); | ||
1381 | size += nr_pages * sizeof(void *); | ||
1382 | |||
1383 | data = kzalloc(size, GFP_KERNEL); | ||
1384 | if (!data) | ||
1385 | goto fail; | ||
1386 | |||
1387 | data->user_page = (void *)get_zeroed_page(GFP_KERNEL); | ||
1388 | if (!data->user_page) | ||
1389 | goto fail_user_page; | ||
1390 | |||
1391 | for (i = 0; i < nr_pages; i++) { | ||
1392 | data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); | ||
1393 | if (!data->data_pages[i]) | ||
1394 | goto fail_data_pages; | ||
1395 | } | ||
1396 | |||
1397 | data->nr_pages = nr_pages; | ||
1398 | |||
1399 | rcu_assign_pointer(counter->data, data); | ||
1400 | |||
1401 | return 0; | ||
1402 | |||
1403 | fail_data_pages: | ||
1404 | for (i--; i >= 0; i--) | ||
1405 | free_page((unsigned long)data->data_pages[i]); | ||
1406 | |||
1407 | free_page((unsigned long)data->user_page); | ||
1408 | |||
1409 | fail_user_page: | ||
1410 | kfree(data); | ||
1411 | |||
1412 | fail: | ||
1413 | return -ENOMEM; | ||
1414 | } | ||
1415 | |||
1416 | static void __perf_mmap_data_free(struct rcu_head *rcu_head) | ||
1417 | { | ||
1418 | struct perf_mmap_data *data = container_of(rcu_head, | ||
1419 | struct perf_mmap_data, rcu_head); | ||
1420 | int i; | ||
1421 | |||
1422 | free_page((unsigned long)data->user_page); | ||
1423 | for (i = 0; i < data->nr_pages; i++) | ||
1424 | free_page((unsigned long)data->data_pages[i]); | ||
1425 | kfree(data); | ||
1426 | } | ||
1427 | |||
1428 | static void perf_mmap_data_free(struct perf_counter *counter) | ||
1429 | { | ||
1430 | struct perf_mmap_data *data = counter->data; | ||
1431 | |||
1432 | WARN_ON(atomic_read(&counter->mmap_count)); | ||
1433 | |||
1434 | rcu_assign_pointer(counter->data, NULL); | ||
1435 | call_rcu(&data->rcu_head, __perf_mmap_data_free); | ||
1436 | } | ||
1437 | |||
1438 | static void perf_mmap_open(struct vm_area_struct *vma) | ||
1439 | { | ||
1440 | struct perf_counter *counter = vma->vm_file->private_data; | ||
1441 | |||
1442 | atomic_inc(&counter->mmap_count); | ||
1443 | } | ||
1444 | |||
1445 | static void perf_mmap_close(struct vm_area_struct *vma) | ||
1446 | { | ||
1447 | struct perf_counter *counter = vma->vm_file->private_data; | ||
1448 | |||
1449 | if (atomic_dec_and_mutex_lock(&counter->mmap_count, | ||
1450 | &counter->mmap_mutex)) { | ||
1451 | vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; | ||
1452 | perf_mmap_data_free(counter); | ||
1453 | mutex_unlock(&counter->mmap_mutex); | ||
1454 | } | ||
1455 | } | ||
1456 | |||
1457 | static struct vm_operations_struct perf_mmap_vmops = { | ||
1458 | .open = perf_mmap_open, | ||
1459 | .close = perf_mmap_close, | ||
1460 | .fault = perf_mmap_fault, | ||
1461 | }; | ||
1462 | |||
1463 | static int perf_mmap(struct file *file, struct vm_area_struct *vma) | ||
1464 | { | ||
1465 | struct perf_counter *counter = file->private_data; | ||
1466 | unsigned long vma_size; | ||
1467 | unsigned long nr_pages; | ||
1468 | unsigned long locked, lock_limit; | ||
1469 | int ret = 0; | ||
1470 | |||
1471 | if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) | ||
1472 | return -EINVAL; | ||
1473 | |||
1474 | vma_size = vma->vm_end - vma->vm_start; | ||
1475 | nr_pages = (vma_size / PAGE_SIZE) - 1; | ||
1476 | |||
1477 | /* | ||
1478 | * If we have data pages ensure they're a power-of-two number, so we | ||
1479 | * can do bitmasks instead of modulo. | ||
1480 | */ | ||
1481 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) | ||
1482 | return -EINVAL; | ||
1483 | |||
1484 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) | ||
1485 | return -EINVAL; | ||
1486 | |||
1487 | if (vma->vm_pgoff != 0) | ||
1488 | return -EINVAL; | ||
1489 | |||
1490 | mutex_lock(&counter->mmap_mutex); | ||
1491 | if (atomic_inc_not_zero(&counter->mmap_count)) { | ||
1492 | if (nr_pages != counter->data->nr_pages) | ||
1493 | ret = -EINVAL; | ||
1494 | goto unlock; | ||
1495 | } | ||
1496 | |||
1497 | locked = vma->vm_mm->locked_vm; | ||
1498 | locked += nr_pages + 1; | ||
1499 | |||
1500 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
1501 | lock_limit >>= PAGE_SHIFT; | ||
1502 | |||
1503 | if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { | ||
1504 | ret = -EPERM; | ||
1505 | goto unlock; | ||
1506 | } | ||
1507 | |||
1508 | WARN_ON(counter->data); | ||
1509 | ret = perf_mmap_data_alloc(counter, nr_pages); | ||
1510 | if (ret) | ||
1511 | goto unlock; | ||
1512 | |||
1513 | atomic_set(&counter->mmap_count, 1); | ||
1514 | vma->vm_mm->locked_vm += nr_pages + 1; | ||
1515 | unlock: | ||
1516 | mutex_unlock(&counter->mmap_mutex); | ||
1517 | |||
1518 | vma->vm_flags &= ~VM_MAYWRITE; | ||
1519 | vma->vm_flags |= VM_RESERVED; | ||
1520 | vma->vm_ops = &perf_mmap_vmops; | ||
1521 | |||
1522 | return ret; | ||
1523 | } | ||
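Seen from user space, the mapping must be shared, read-only and cover one metadata page plus a power-of-two number of data pages, matching the checks above. A hedged sketch of the corresponding mmap() call:

    #include <sys/mman.h>
    #include <unistd.h>

    /*
     * Illustrative only: map the metadata page plus 'nr_pages' (a power of
     * two) data pages.  Returns MAP_FAILED when e.g. the RLIMIT_MEMLOCK
     * check in perf_mmap() rejects the request.
     */
    static void *map_counter(int fd, unsigned long nr_pages)
    {
            size_t len = (nr_pages + 1) * sysconf(_SC_PAGESIZE);

            return mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    }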
1524 | |||
1525 | static int perf_fasync(int fd, struct file *filp, int on) | ||
1526 | { | ||
1527 | struct perf_counter *counter = filp->private_data; | ||
1528 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
1529 | int retval; | ||
1530 | |||
1531 | mutex_lock(&inode->i_mutex); | ||
1532 | retval = fasync_helper(fd, filp, on, &counter->fasync); | ||
1533 | mutex_unlock(&inode->i_mutex); | ||
1534 | |||
1535 | if (retval < 0) | ||
1536 | return retval; | ||
1537 | |||
1538 | return 0; | ||
1539 | } | ||
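Because perf_counter_wakeup() ends in kill_fasync(), a consumer may also ask for SIGIO instead of polling; the standard fcntl() ownership/O_ASYNC setup is all that is needed (sketch, error handling kept minimal):

    #include <fcntl.h>
    #include <unistd.h>

    /* Ask for SIGIO delivery to this process on counter wakeups. */
    static int enable_sigio(int fd)
    {
            if (fcntl(fd, F_SETOWN, getpid()) < 0)
                    return -1;
            return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
    }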
1540 | |||
1541 | static const struct file_operations perf_fops = { | ||
1542 | .release = perf_release, | ||
1543 | .read = perf_read, | ||
1544 | .poll = perf_poll, | ||
1545 | .unlocked_ioctl = perf_ioctl, | ||
1546 | .compat_ioctl = perf_ioctl, | ||
1547 | .mmap = perf_mmap, | ||
1548 | .fasync = perf_fasync, | ||
1549 | }; | ||
1550 | |||
1551 | /* | ||
1552 | * Perf counter wakeup | ||
1553 | * | ||
1554 | * If there's data, ensure we set the poll() state and publish everything | ||
1555 | * to user-space before waking everybody up. | ||
1556 | */ | ||
1557 | |||
1558 | void perf_counter_wakeup(struct perf_counter *counter) | ||
1559 | { | ||
1560 | struct perf_mmap_data *data; | ||
1561 | |||
1562 | rcu_read_lock(); | ||
1563 | data = rcu_dereference(counter->data); | ||
1564 | if (data) { | ||
1565 | atomic_set(&data->wakeup, POLL_IN); | ||
1566 | /* | ||
1567 | * Ensure all data writes are issued before updating the | ||
1568 | * user-space data head information. The matching rmb() | ||
1569 | * will be in userspace after reading this value. | ||
1570 | */ | ||
1571 | smp_wmb(); | ||
1572 | data->user_page->data_head = atomic_read(&data->head); | ||
1573 | } | ||
1574 | rcu_read_unlock(); | ||
1575 | |||
1576 | wake_up_all(&counter->waitq); | ||
1577 | |||
1578 | if (counter->pending_kill) { | ||
1579 | kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); | ||
1580 | counter->pending_kill = 0; | ||
1581 | } | ||
1582 | } | ||
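The smp_wmb() above pairs with a read barrier on the consumer side: user space must load data_head, issue its rmb(), and only then read records up to that offset. A small sketch of that pairing, where the pointer to data_head and its width are assumptions about the metadata page layout:

    #include <stdint.h>

    /*
     * 'head' points at the data_head field inside the mmap()ed metadata
     * page; its exact offset and width are defined by
     * struct perf_counter_mmap_page and are assumptions here.
     */
    static uint64_t snapshot_head(volatile uint64_t *head)
    {
            uint64_t h = *head;

            __sync_synchronize();   /* the rmb() matching the smp_wmb() above */
            return h;               /* records up to 'h' may now be read */
    }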
1583 | |||
1584 | /* | ||
1585 | * Pending wakeups | ||
1586 | * | ||
1587 | * Handle the case where we need to wake up from NMI (or rq->lock) context. | ||
1588 | * | ||
1589 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
1590 | * single linked list and use cmpxchg() to add entries lockless. | ||
1591 | */ | ||
1592 | |||
1593 | static void perf_pending_counter(struct perf_pending_entry *entry) | ||
1594 | { | ||
1595 | struct perf_counter *counter = container_of(entry, | ||
1596 | struct perf_counter, pending); | ||
1597 | |||
1598 | if (counter->pending_disable) { | ||
1599 | counter->pending_disable = 0; | ||
1600 | perf_counter_disable(counter); | ||
1601 | } | ||
1602 | |||
1603 | if (counter->pending_wakeup) { | ||
1604 | counter->pending_wakeup = 0; | ||
1605 | perf_counter_wakeup(counter); | ||
1606 | } | ||
1607 | } | ||
1608 | |||
1609 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
1610 | |||
1611 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
1612 | PENDING_TAIL, | ||
1613 | }; | ||
1614 | |||
1615 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
1616 | void (*func)(struct perf_pending_entry *)) | ||
1617 | { | ||
1618 | struct perf_pending_entry **head; | ||
1619 | |||
1620 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
1621 | return; | ||
1622 | |||
1623 | entry->func = func; | ||
1624 | |||
1625 | head = &get_cpu_var(perf_pending_head); | ||
1626 | |||
1627 | do { | ||
1628 | entry->next = *head; | ||
1629 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
1630 | |||
1631 | set_perf_counter_pending(); | ||
1632 | |||
1633 | put_cpu_var(perf_pending_head); | ||
1634 | } | ||
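The queueing trick is two lockless steps: one cmpxchg() marks the entry as busy (NULL -> PENDING_TAIL), a second cmpxchg() loop publishes it on the per-cpu list head, which is what makes it safe from NMI context. A user-space analog of the same push, built on GCC's __sync builtins purely for illustration:

    #include <stddef.h>

    struct pending {
            struct pending *next;   /* NULL means "not queued" */
            void (*func)(struct pending *);
    };

    #define TAIL ((struct pending *)-1UL)

    static struct pending *list_head = TAIL;    /* one consumer, many producers */

    /* Push 'e' unless it is already queued; mirrors perf_pending_queue(). */
    static void push(struct pending *e, void (*func)(struct pending *))
    {
            struct pending *old;

            /* Claim the entry: next goes from NULL to TAIL exactly once. */
            if (!__sync_bool_compare_and_swap(&e->next, NULL, TAIL))
                    return;

            e->func = func;
            do {
                    old = list_head;
                    e->next = old;
            } while (!__sync_bool_compare_and_swap(&list_head, old, e));
    }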
1635 | |||
1636 | static int __perf_pending_run(void) | ||
1637 | { | ||
1638 | struct perf_pending_entry *list; | ||
1639 | int nr = 0; | ||
1640 | |||
1641 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
1642 | while (list != PENDING_TAIL) { | ||
1643 | void (*func)(struct perf_pending_entry *); | ||
1644 | struct perf_pending_entry *entry = list; | ||
1645 | |||
1646 | list = list->next; | ||
1647 | |||
1648 | func = entry->func; | ||
1649 | entry->next = NULL; | ||
1650 | /* | ||
1651 | * Ensure we observe the unqueue before we issue the wakeup, | ||
1652 | * so that we won't be waiting forever. | ||
1653 | * -- see perf_not_pending(). | ||
1654 | */ | ||
1655 | smp_wmb(); | ||
1656 | |||
1657 | func(entry); | ||
1658 | nr++; | ||
1659 | } | ||
1660 | |||
1661 | return nr; | ||
1662 | } | ||
1663 | |||
1664 | static inline int perf_not_pending(struct perf_counter *counter) | ||
1665 | { | ||
1666 | /* | ||
1667 | * If we flush the pending list on whichever CPU we are running on, | ||
1668 | * there is a chance we will not need to wait at all. | ||
1669 | */ | ||
1670 | get_cpu(); | ||
1671 | __perf_pending_run(); | ||
1672 | put_cpu(); | ||
1673 | |||
1674 | /* | ||
1675 | * Ensure we see the proper queue state before going to sleep | ||
1676 | * so that we do not miss the wakeup. -- see __perf_pending_run() | ||
1677 | */ | ||
1678 | smp_rmb(); | ||
1679 | return counter->pending.next == NULL; | ||
1680 | } | ||
1681 | |||
1682 | static void perf_pending_sync(struct perf_counter *counter) | ||
1683 | { | ||
1684 | wait_event(counter->waitq, perf_not_pending(counter)); | ||
1685 | } | ||
1686 | |||
1687 | void perf_counter_do_pending(void) | ||
1688 | { | ||
1689 | __perf_pending_run(); | ||
1690 | } | ||
1691 | |||
1692 | /* | ||
1693 | * Callchain support -- arch specific | ||
1694 | */ | ||
1695 | |||
1696 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1697 | { | ||
1698 | return NULL; | ||
1699 | } | ||
1700 | |||
1701 | /* | ||
1702 | * Output | ||
1703 | */ | ||
1704 | |||
1705 | struct perf_output_handle { | ||
1706 | struct perf_counter *counter; | ||
1707 | struct perf_mmap_data *data; | ||
1708 | unsigned int offset; | ||
1709 | unsigned int head; | ||
1710 | int wakeup; | ||
1711 | int nmi; | ||
1712 | int overflow; | ||
1713 | }; | ||
1714 | |||
1715 | static inline void __perf_output_wakeup(struct perf_output_handle *handle) | ||
1716 | { | ||
1717 | if (handle->nmi) { | ||
1718 | handle->counter->pending_wakeup = 1; | ||
1719 | perf_pending_queue(&handle->counter->pending, | ||
1720 | perf_pending_counter); | ||
1721 | } else | ||
1722 | perf_counter_wakeup(handle->counter); | ||
1723 | } | ||
1724 | |||
1725 | static int perf_output_begin(struct perf_output_handle *handle, | ||
1726 | struct perf_counter *counter, unsigned int size, | ||
1727 | int nmi, int overflow) | ||
1728 | { | ||
1729 | struct perf_mmap_data *data; | ||
1730 | unsigned int offset, head; | ||
1731 | |||
1732 | rcu_read_lock(); | ||
1733 | data = rcu_dereference(counter->data); | ||
1734 | if (!data) | ||
1735 | goto out; | ||
1736 | |||
1737 | handle->counter = counter; | ||
1738 | handle->nmi = nmi; | ||
1739 | handle->overflow = overflow; | ||
1740 | |||
1741 | if (!data->nr_pages) | ||
1742 | goto fail; | ||
1743 | |||
1744 | do { | ||
1745 | offset = head = atomic_read(&data->head); | ||
1746 | head += size; | ||
1747 | } while (atomic_cmpxchg(&data->head, offset, head) != offset); | ||
1748 | |||
1749 | handle->data = data; | ||
1750 | handle->offset = offset; | ||
1751 | handle->head = head; | ||
1752 | handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); | ||
1753 | |||
1754 | return 0; | ||
1755 | |||
1756 | fail: | ||
1757 | __perf_output_wakeup(handle); | ||
1758 | out: | ||
1759 | rcu_read_unlock(); | ||
1760 | |||
1761 | return -ENOSPC; | ||
1762 | } | ||
1763 | |||
1764 | static void perf_output_copy(struct perf_output_handle *handle, | ||
1765 | void *buf, unsigned int len) | ||
1766 | { | ||
1767 | unsigned int pages_mask; | ||
1768 | unsigned int offset; | ||
1769 | unsigned int size; | ||
1770 | void **pages; | ||
1771 | |||
1772 | offset = handle->offset; | ||
1773 | pages_mask = handle->data->nr_pages - 1; | ||
1774 | pages = handle->data->data_pages; | ||
1775 | |||
1776 | do { | ||
1777 | unsigned int page_offset; | ||
1778 | int nr; | ||
1779 | |||
1780 | nr = (offset >> PAGE_SHIFT) & pages_mask; | ||
1781 | page_offset = offset & (PAGE_SIZE - 1); | ||
1782 | size = min_t(unsigned int, PAGE_SIZE - page_offset, len); | ||
1783 | |||
1784 | memcpy(pages[nr] + page_offset, buf, size); | ||
1785 | |||
1786 | len -= size; | ||
1787 | buf += size; | ||
1788 | offset += size; | ||
1789 | } while (len); | ||
1790 | |||
1791 | handle->offset = offset; | ||
1792 | |||
1793 | WARN_ON_ONCE(handle->offset > handle->head); | ||
1794 | } | ||
1795 | |||
1796 | #define perf_output_put(handle, x) \ | ||
1797 | perf_output_copy((handle), &(x), sizeof(x)) | ||
1798 | |||
1799 | static void perf_output_end(struct perf_output_handle *handle) | ||
1800 | { | ||
1801 | int wakeup_events = handle->counter->hw_event.wakeup_events; | ||
1802 | |||
1803 | if (handle->overflow && wakeup_events) { | ||
1804 | int events = atomic_inc_return(&handle->data->events); | ||
1805 | if (events >= wakeup_events) { | ||
1806 | atomic_sub(wakeup_events, &handle->data->events); | ||
1807 | __perf_output_wakeup(handle); | ||
1808 | } | ||
1809 | } else if (handle->wakeup) | ||
1810 | __perf_output_wakeup(handle); | ||
1811 | rcu_read_unlock(); | ||
1812 | } | ||
1813 | |||
1814 | static void perf_counter_output(struct perf_counter *counter, | ||
1815 | int nmi, struct pt_regs *regs) | ||
1816 | { | ||
1817 | int ret; | ||
1818 | u64 record_type = counter->hw_event.record_type; | ||
1819 | struct perf_output_handle handle; | ||
1820 | struct perf_event_header header; | ||
1821 | u64 ip; | ||
1822 | struct { | ||
1823 | u32 pid, tid; | ||
1824 | } tid_entry; | ||
1825 | struct { | ||
1826 | u64 event; | ||
1827 | u64 counter; | ||
1828 | } group_entry; | ||
1829 | struct perf_callchain_entry *callchain = NULL; | ||
1830 | int callchain_size = 0; | ||
1831 | u64 time; | ||
1832 | |||
1833 | header.type = PERF_EVENT_COUNTER_OVERFLOW; | ||
1834 | header.size = sizeof(header); | ||
1835 | |||
1836 | if (record_type & PERF_RECORD_IP) { | ||
1837 | ip = instruction_pointer(regs); | ||
1838 | header.type |= __PERF_EVENT_IP; | ||
1839 | header.size += sizeof(ip); | ||
1840 | } | ||
1841 | |||
1842 | if (record_type & PERF_RECORD_TID) { | ||
1843 | /* namespace issues: global pid/tid, pid namespaces not handled yet */ | ||
1844 | tid_entry.pid = current->group_leader->pid; | ||
1845 | tid_entry.tid = current->pid; | ||
1846 | |||
1847 | header.type |= __PERF_EVENT_TID; | ||
1848 | header.size += sizeof(tid_entry); | ||
1849 | } | ||
1850 | |||
1851 | if (record_type & PERF_RECORD_GROUP) { | ||
1852 | header.type |= __PERF_EVENT_GROUP; | ||
1853 | header.size += sizeof(u64) + | ||
1854 | counter->nr_siblings * sizeof(group_entry); | ||
1855 | } | ||
1856 | |||
1857 | if (record_type & PERF_RECORD_CALLCHAIN) { | ||
1858 | callchain = perf_callchain(regs); | ||
1859 | |||
1860 | if (callchain) { | ||
1861 | callchain_size = (1 + callchain->nr) * sizeof(u64); | ||
1862 | |||
1863 | header.type |= __PERF_EVENT_CALLCHAIN; | ||
1864 | header.size += callchain_size; | ||
1865 | } | ||
1866 | } | ||
1867 | |||
1868 | if (record_type & PERF_RECORD_TIME) { | ||
1869 | /* | ||
1870 | * Maybe do better on x86 and provide cpu_clock_nmi() | ||
1871 | */ | ||
1872 | time = sched_clock(); | ||
1873 | |||
1874 | header.type |= __PERF_EVENT_TIME; | ||
1875 | header.size += sizeof(u64); | ||
1876 | } | ||
1877 | |||
1878 | ret = perf_output_begin(&handle, counter, header.size, nmi, 1); | ||
1879 | if (ret) | ||
1880 | return; | ||
1881 | |||
1882 | perf_output_put(&handle, header); | ||
1883 | |||
1884 | if (record_type & PERF_RECORD_IP) | ||
1885 | perf_output_put(&handle, ip); | ||
1886 | |||
1887 | if (record_type & PERF_RECORD_TID) | ||
1888 | perf_output_put(&handle, tid_entry); | ||
1889 | |||
1890 | if (record_type & PERF_RECORD_GROUP) { | ||
1891 | struct perf_counter *leader, *sub; | ||
1892 | u64 nr = counter->nr_siblings; | ||
1893 | |||
1894 | perf_output_put(&handle, nr); | ||
1895 | |||
1896 | leader = counter->group_leader; | ||
1897 | list_for_each_entry(sub, &leader->sibling_list, list_entry) { | ||
1898 | if (sub != counter) | ||
1899 | sub->hw_ops->read(sub); | ||
1900 | |||
1901 | group_entry.event = sub->hw_event.config; | ||
1902 | group_entry.counter = atomic64_read(&sub->count); | ||
1903 | |||
1904 | perf_output_put(&handle, group_entry); | ||
1905 | } | ||
1906 | } | ||
1907 | |||
1908 | if (callchain) | ||
1909 | perf_output_copy(&handle, callchain, callchain_size); | ||
1910 | |||
1911 | if (record_type & PERF_RECORD_TIME) | ||
1912 | perf_output_put(&handle, time); | ||
1913 | |||
1914 | perf_output_end(&handle); | ||
1915 | } | ||
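Putting the pieces together, every overflow record is a header followed by the optional sections in exactly the order they are appended above: IP, pid/tid, the group read-out, the callchain, then the timestamp. A consumer can walk the buffer purely by header.size; the struct below is an illustrative reconstruction (the authoritative perf_event_header lives in the patched linux/perf_counter.h) and the sketch ignores ring-buffer wrap-around:

    #include <stdint.h>

    /* Assumed header shape; see linux/perf_counter.h for the real one. */
    struct record_header {
            uint32_t type;  /* PERF_EVENT_COUNTER_OVERFLOW plus __PERF_EVENT_* bits */
            uint32_t size;  /* header plus all optional sections */
    };

    /* Count records in [base, base + head); ignores buffer wrap-around. */
    static unsigned int count_records(void *base, uint64_t head)
    {
            unsigned int nr = 0;
            uint64_t off = 0;

            while (off + sizeof(struct record_header) <= head) {
                    struct record_header *hdr = (void *)((char *)base + off);

                    if (!hdr->size)
                            break;          /* unwritten or malformed area */
                    nr++;
                    off += hdr->size;
            }
            return nr;
    }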
1916 | |||
1917 | /* | ||
1918 | * mmap tracking | ||
1919 | */ | ||
1920 | |||
1921 | struct perf_mmap_event { | ||
1922 | struct file *file; | ||
1923 | char *file_name; | ||
1924 | int file_size; | ||
1925 | |||
1926 | struct { | ||
1927 | struct perf_event_header header; | ||
1928 | |||
1929 | u32 pid; | ||
1930 | u32 tid; | ||
1931 | u64 start; | ||
1932 | u64 len; | ||
1933 | u64 pgoff; | ||
1934 | } event; | ||
1935 | }; | ||
1936 | |||
1937 | static void perf_counter_mmap_output(struct perf_counter *counter, | ||
1938 | struct perf_mmap_event *mmap_event) | ||
1939 | { | ||
1940 | struct perf_output_handle handle; | ||
1941 | int size = mmap_event->event.header.size; | ||
1942 | int ret = perf_output_begin(&handle, counter, size, 0, 0); | ||
1943 | |||
1944 | if (ret) | ||
1945 | return; | ||
1946 | |||
1947 | perf_output_put(&handle, mmap_event->event); | ||
1948 | perf_output_copy(&handle, mmap_event->file_name, | ||
1949 | mmap_event->file_size); | ||
1950 | perf_output_end(&handle); | ||
1951 | } | ||
1952 | |||
1953 | static int perf_counter_mmap_match(struct perf_counter *counter, | ||
1954 | struct perf_mmap_event *mmap_event) | ||
1955 | { | ||
1956 | if (counter->hw_event.mmap && | ||
1957 | mmap_event->event.header.type == PERF_EVENT_MMAP) | ||
1958 | return 1; | ||
1959 | |||
1960 | if (counter->hw_event.munmap && | ||
1961 | mmap_event->event.header.type == PERF_EVENT_MUNMAP) | ||
1962 | return 1; | ||
1963 | |||
1964 | return 0; | ||
1965 | } | ||
1966 | |||
1967 | static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, | ||
1968 | struct perf_mmap_event *mmap_event) | ||
1969 | { | ||
1970 | struct perf_counter *counter; | ||
1971 | |||
1972 | if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) | ||
1973 | return; | ||
1974 | |||
1975 | rcu_read_lock(); | ||
1976 | list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { | ||
1977 | if (perf_counter_mmap_match(counter, mmap_event)) | ||
1978 | perf_counter_mmap_output(counter, mmap_event); | ||
1979 | } | ||
1980 | rcu_read_unlock(); | ||
1981 | } | ||
1982 | |||
1983 | static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) | ||
1984 | { | ||
1985 | struct perf_cpu_context *cpuctx; | ||
1986 | struct file *file = mmap_event->file; | ||
1987 | unsigned int size; | ||
1988 | char tmp[16]; | ||
1989 | char *buf = NULL; | ||
1990 | char *name; | ||
1991 | |||
1992 | if (file) { | ||
1993 | buf = kzalloc(PATH_MAX, GFP_KERNEL); | ||
1994 | if (!buf) { | ||
1995 | name = strncpy(tmp, "//enomem", sizeof(tmp)); | ||
1996 | goto got_name; | ||
1997 | } | ||
1998 | name = dentry_path(file->f_dentry, buf, PATH_MAX); | ||
1999 | if (IS_ERR(name)) { | ||
2000 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | ||
2001 | goto got_name; | ||
2002 | } | ||
2003 | } else { | ||
2004 | name = strncpy(tmp, "//anon", sizeof(tmp)); | ||
2005 | goto got_name; | ||
2006 | } | ||
2007 | |||
2008 | got_name: | ||
2009 | size = ALIGN(strlen(name), sizeof(u64)); | ||
2010 | |||
2011 | mmap_event->file_name = name; | ||
2012 | mmap_event->file_size = size; | ||
2013 | |||
2014 | mmap_event->event.header.size = sizeof(mmap_event->event) + size; | ||
2015 | |||
2016 | cpuctx = &get_cpu_var(perf_cpu_context); | ||
2017 | perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); | ||
2018 | put_cpu_var(perf_cpu_context); | ||
2019 | |||
2020 | perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); | ||
2021 | |||
2022 | kfree(buf); | ||
2023 | } | ||
2024 | |||
2025 | void perf_counter_mmap(unsigned long addr, unsigned long len, | ||
2026 | unsigned long pgoff, struct file *file) | ||
2027 | { | ||
2028 | struct perf_mmap_event mmap_event = { | ||
2029 | .file = file, | ||
2030 | .event = { | ||
2031 | .header = { .type = PERF_EVENT_MMAP, }, | ||
2032 | .pid = current->group_leader->pid, | ||
2033 | .tid = current->pid, | ||
2034 | .start = addr, | ||
2035 | .len = len, | ||
2036 | .pgoff = pgoff, | ||
2037 | }, | ||
2038 | }; | ||
2039 | |||
2040 | perf_counter_mmap_event(&mmap_event); | ||
2041 | } | ||
2042 | |||
2043 | void perf_counter_munmap(unsigned long addr, unsigned long len, | ||
2044 | unsigned long pgoff, struct file *file) | ||
2045 | { | ||
2046 | struct perf_mmap_event mmap_event = { | ||
2047 | .file = file, | ||
2048 | .event = { | ||
2049 | .header = { .type = PERF_EVENT_MUNMAP, }, | ||
2050 | .pid = current->group_leader->pid, | ||
2051 | .tid = current->pid, | ||
2052 | .start = addr, | ||
2053 | .len = len, | ||
2054 | .pgoff = pgoff, | ||
2055 | }, | ||
2056 | }; | ||
2057 | |||
2058 | perf_counter_mmap_event(&mmap_event); | ||
2059 | } | ||
2060 | |||
2061 | /* | ||
2062 | * Generic counter overflow handling. | ||
2063 | */ | ||
2064 | |||
2065 | int perf_counter_overflow(struct perf_counter *counter, | ||
2066 | int nmi, struct pt_regs *regs) | ||
2067 | { | ||
2068 | int events = atomic_read(&counter->event_limit); | ||
2069 | int ret = 0; | ||
2070 | |||
2071 | counter->pending_kill = POLL_IN; | ||
2072 | if (events && atomic_dec_and_test(&counter->event_limit)) { | ||
2073 | ret = 1; | ||
2074 | counter->pending_kill = POLL_HUP; | ||
2075 | if (nmi) { | ||
2076 | counter->pending_disable = 1; | ||
2077 | perf_pending_queue(&counter->pending, | ||
2078 | perf_pending_counter); | ||
2079 | } else | ||
2080 | perf_counter_disable(counter); | ||
2081 | } | ||
2082 | |||
2083 | perf_counter_output(counter, nmi, regs); | ||
2084 | return ret; | ||
2085 | } | ||
2086 | |||
2087 | /* | ||
2088 | * Generic software counter infrastructure | ||
2089 | */ | ||
2090 | |||
2091 | static void perf_swcounter_update(struct perf_counter *counter) | ||
2092 | { | ||
2093 | struct hw_perf_counter *hwc = &counter->hw; | ||
2094 | u64 prev, now; | ||
2095 | s64 delta; | ||
2096 | |||
2097 | again: | ||
2098 | prev = atomic64_read(&hwc->prev_count); | ||
2099 | now = atomic64_read(&hwc->count); | ||
2100 | if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) | ||
2101 | goto again; | ||
2102 | |||
2103 | delta = now - prev; | ||
2104 | |||
2105 | atomic64_add(delta, &counter->count); | ||
2106 | atomic64_sub(delta, &hwc->period_left); | ||
2107 | } | ||
2108 | |||
2109 | static void perf_swcounter_set_period(struct perf_counter *counter) | ||
2110 | { | ||
2111 | struct hw_perf_counter *hwc = &counter->hw; | ||
2112 | s64 left = atomic64_read(&hwc->period_left); | ||
2113 | s64 period = hwc->irq_period; | ||
2114 | |||
2115 | if (unlikely(left <= -period)) { | ||
2116 | left = period; | ||
2117 | atomic64_set(&hwc->period_left, left); | ||
2118 | } | ||
2119 | |||
2120 | if (unlikely(left <= 0)) { | ||
2121 | left += period; | ||
2122 | atomic64_add(period, &hwc->period_left); | ||
2123 | } | ||
2124 | |||
2125 | atomic64_set(&hwc->prev_count, -left); | ||
2126 | atomic64_set(&hwc->count, -left); | ||
2127 | } | ||
2128 | |||
2129 | static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) | ||
2130 | { | ||
2131 | enum hrtimer_restart ret = HRTIMER_RESTART; | ||
2132 | struct perf_counter *counter; | ||
2133 | struct pt_regs *regs; | ||
2134 | |||
2135 | counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); | ||
2136 | counter->hw_ops->read(counter); | ||
2137 | |||
2138 | regs = get_irq_regs(); | ||
2139 | /* | ||
2140 | * In case we exclude kernel IPs or are somehow not in interrupt | ||
2141 | * context, provide the next best thing, the user IP. | ||
2142 | */ | ||
2143 | if ((counter->hw_event.exclude_kernel || !regs) && | ||
2144 | !counter->hw_event.exclude_user) | ||
2145 | regs = task_pt_regs(current); | ||
2146 | |||
2147 | if (regs) { | ||
2148 | if (perf_counter_overflow(counter, 0, regs)) | ||
2149 | ret = HRTIMER_NORESTART; | ||
2150 | } | ||
2151 | |||
2152 | hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); | ||
2153 | |||
2154 | return ret; | ||
2155 | } | ||
2156 | |||
2157 | static void perf_swcounter_overflow(struct perf_counter *counter, | ||
2158 | int nmi, struct pt_regs *regs) | ||
2159 | { | ||
2160 | perf_swcounter_update(counter); | ||
2161 | perf_swcounter_set_period(counter); | ||
2162 | if (perf_counter_overflow(counter, nmi, regs)) | ||
2163 | /* soft-disable the counter */ | ||
2164 | ; | ||
2165 | |||
2166 | } | ||
2167 | |||
2168 | static int perf_swcounter_match(struct perf_counter *counter, | ||
2169 | enum perf_event_types type, | ||
2170 | u32 event, struct pt_regs *regs) | ||
2171 | { | ||
2172 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) | ||
2173 | return 0; | ||
2174 | |||
2175 | if (perf_event_raw(&counter->hw_event)) | ||
2176 | return 0; | ||
2177 | |||
2178 | if (perf_event_type(&counter->hw_event) != type) | ||
2179 | return 0; | ||
2180 | |||
2181 | if (perf_event_id(&counter->hw_event) != event) | ||
2182 | return 0; | ||
2183 | |||
2184 | if (counter->hw_event.exclude_user && user_mode(regs)) | ||
2185 | return 0; | ||
2186 | |||
2187 | if (counter->hw_event.exclude_kernel && !user_mode(regs)) | ||
2188 | return 0; | ||
2189 | |||
2190 | return 1; | ||
2191 | } | ||
2192 | |||
2193 | static void perf_swcounter_add(struct perf_counter *counter, u64 nr, | ||
2194 | int nmi, struct pt_regs *regs) | ||
2195 | { | ||
2196 | int neg = atomic64_add_negative(nr, &counter->hw.count); | ||
2197 | if (counter->hw.irq_period && !neg) | ||
2198 | perf_swcounter_overflow(counter, nmi, regs); | ||
2199 | } | ||
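The software counters arm a period by seeding hw.count (and prev_count) with -left, so the count climbs towards zero and atomic64_add_negative() stops returning true exactly when the period expires. A tiny stand-alone illustration of that arithmetic, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const int64_t period = 4;
            int64_t count = -period;        /* perf_swcounter_set_period() seeds -left */
            int event;

            for (event = 1; event <= 6; event++) {
                    count += 1;                     /* perf_swcounter_add(..., 1, ...) */
                    if (count >= 0) {               /* !atomic64_add_negative() */
                            printf("event %d: period expired, overflow fires\n", event);
                            count = -period;        /* perf_swcounter_overflow() re-arms */
                    } else {
                            printf("event %d: %lld events to go\n",
                                   event, (long long)-count);
                    }
            }
            return 0;
    }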
2200 | |||
2201 | static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, | ||
2202 | enum perf_event_types type, u32 event, | ||
2203 | u64 nr, int nmi, struct pt_regs *regs) | ||
2204 | { | ||
2205 | struct perf_counter *counter; | ||
2206 | |||
2207 | if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) | ||
2208 | return; | ||
2209 | |||
2210 | rcu_read_lock(); | ||
2211 | list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { | ||
2212 | if (perf_swcounter_match(counter, type, event, regs)) | ||
2213 | perf_swcounter_add(counter, nr, nmi, regs); | ||
2214 | } | ||
2215 | rcu_read_unlock(); | ||
2216 | } | ||
2217 | |||
2218 | static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) | ||
2219 | { | ||
2220 | if (in_nmi()) | ||
2221 | return &cpuctx->recursion[3]; | ||
2222 | |||
2223 | if (in_irq()) | ||
2224 | return &cpuctx->recursion[2]; | ||
2225 | |||
2226 | if (in_softirq()) | ||
2227 | return &cpuctx->recursion[1]; | ||
2228 | |||
2229 | return &cpuctx->recursion[0]; | ||
2230 | } | ||
2231 | |||
2232 | static void __perf_swcounter_event(enum perf_event_types type, u32 event, | ||
2233 | u64 nr, int nmi, struct pt_regs *regs) | ||
2234 | { | ||
2235 | struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); | ||
2236 | int *recursion = perf_swcounter_recursion_context(cpuctx); | ||
2237 | |||
2238 | if (*recursion) | ||
2239 | goto out; | ||
2240 | |||
2241 | (*recursion)++; | ||
2242 | barrier(); | ||
2243 | |||
2244 | perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); | ||
2245 | if (cpuctx->task_ctx) { | ||
2246 | perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, | ||
2247 | nr, nmi, regs); | ||
2248 | } | ||
2249 | |||
2250 | barrier(); | ||
2251 | (*recursion)--; | ||
2252 | |||
2253 | out: | ||
2254 | put_cpu_var(perf_cpu_context); | ||
2255 | } | ||
2256 | |||
2257 | void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) | ||
2258 | { | ||
2259 | __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); | ||
2260 | } | ||
2261 | |||
2262 | static void perf_swcounter_read(struct perf_counter *counter) | ||
2263 | { | ||
2264 | perf_swcounter_update(counter); | ||
2265 | } | ||
2266 | |||
2267 | static int perf_swcounter_enable(struct perf_counter *counter) | ||
2268 | { | ||
2269 | perf_swcounter_set_period(counter); | ||
2270 | return 0; | ||
2271 | } | ||
2272 | |||
2273 | static void perf_swcounter_disable(struct perf_counter *counter) | ||
2274 | { | ||
2275 | perf_swcounter_update(counter); | ||
2276 | } | ||
2277 | |||
2278 | static const struct hw_perf_counter_ops perf_ops_generic = { | ||
2279 | .enable = perf_swcounter_enable, | ||
2280 | .disable = perf_swcounter_disable, | ||
2281 | .read = perf_swcounter_read, | ||
2282 | }; | ||
2283 | |||
2284 | /* | ||
2285 | * Software counter: cpu wall time clock | ||
2286 | */ | ||
2287 | |||
2288 | static void cpu_clock_perf_counter_update(struct perf_counter *counter) | ||
2289 | { | ||
2290 | int cpu = raw_smp_processor_id(); | ||
2291 | s64 prev; | ||
2292 | u64 now; | ||
2293 | |||
2294 | now = cpu_clock(cpu); | ||
2295 | prev = atomic64_read(&counter->hw.prev_count); | ||
2296 | atomic64_set(&counter->hw.prev_count, now); | ||
2297 | atomic64_add(now - prev, &counter->count); | ||
2298 | } | ||
2299 | |||
2300 | static int cpu_clock_perf_counter_enable(struct perf_counter *counter) | ||
2301 | { | ||
2302 | struct hw_perf_counter *hwc = &counter->hw; | ||
2303 | int cpu = raw_smp_processor_id(); | ||
2304 | |||
2305 | atomic64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
2306 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2307 | hwc->hrtimer.function = perf_swcounter_hrtimer; | ||
2308 | if (hwc->irq_period) { | ||
2309 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
2310 | ns_to_ktime(hwc->irq_period), 0, | ||
2311 | HRTIMER_MODE_REL, 0); | ||
2312 | } | ||
2313 | |||
2314 | return 0; | ||
2315 | } | ||
2316 | |||
2317 | static void cpu_clock_perf_counter_disable(struct perf_counter *counter) | ||
2318 | { | ||
2319 | hrtimer_cancel(&counter->hw.hrtimer); | ||
2320 | cpu_clock_perf_counter_update(counter); | ||
2321 | } | ||
2322 | |||
2323 | static void cpu_clock_perf_counter_read(struct perf_counter *counter) | ||
2324 | { | ||
2325 | cpu_clock_perf_counter_update(counter); | ||
2326 | } | ||
2327 | |||
2328 | static const struct hw_perf_counter_ops perf_ops_cpu_clock = { | ||
2329 | .enable = cpu_clock_perf_counter_enable, | ||
2330 | .disable = cpu_clock_perf_counter_disable, | ||
2331 | .read = cpu_clock_perf_counter_read, | ||
2332 | }; | ||
2333 | |||
2334 | /* | ||
2335 | * Software counter: task time clock | ||
2336 | */ | ||
2337 | |||
2338 | static void task_clock_perf_counter_update(struct perf_counter *counter) | ||
2339 | { | ||
2340 | u64 prev, now; | ||
2341 | s64 delta; | ||
2342 | |||
2343 | now = counter->ctx->time; | ||
2344 | |||
2345 | prev = atomic64_xchg(&counter->hw.prev_count, now); | ||
2346 | delta = now - prev; | ||
2347 | atomic64_add(delta, &counter->count); | ||
2348 | } | ||
2349 | |||
2350 | static int task_clock_perf_counter_enable(struct perf_counter *counter) | ||
2351 | { | ||
2352 | struct hw_perf_counter *hwc = &counter->hw; | ||
2353 | u64 now; | ||
2354 | |||
2355 | now = counter->ctx->time; | ||
2356 | |||
2357 | atomic64_set(&hwc->prev_count, now); | ||
2358 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2359 | hwc->hrtimer.function = perf_swcounter_hrtimer; | ||
2360 | if (hwc->irq_period) { | ||
2361 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
2362 | ns_to_ktime(hwc->irq_period), 0, | ||
2363 | HRTIMER_MODE_REL, 0); | ||
2364 | } | ||
2365 | |||
2366 | return 0; | ||
2367 | } | ||
2368 | |||
2369 | static void task_clock_perf_counter_disable(struct perf_counter *counter) | ||
2370 | { | ||
2371 | hrtimer_cancel(&counter->hw.hrtimer); | ||
2372 | task_clock_perf_counter_update(counter); | ||
2373 | } | ||
2374 | |||
2375 | static void task_clock_perf_counter_read(struct perf_counter *counter) | ||
2376 | { | ||
2377 | update_context_time(counter->ctx); | ||
2378 | task_clock_perf_counter_update(counter); | ||
2379 | } | ||
2380 | |||
2381 | static const struct hw_perf_counter_ops perf_ops_task_clock = { | ||
2382 | .enable = task_clock_perf_counter_enable, | ||
2383 | .disable = task_clock_perf_counter_disable, | ||
2384 | .read = task_clock_perf_counter_read, | ||
2385 | }; | ||
2386 | |||
2387 | /* | ||
2388 | * Software counter: cpu migrations | ||
2389 | */ | ||
2390 | |||
2391 | static inline u64 get_cpu_migrations(struct perf_counter *counter) | ||
2392 | { | ||
2393 | struct task_struct *curr = counter->ctx->task; | ||
2394 | |||
2395 | if (curr) | ||
2396 | return curr->se.nr_migrations; | ||
2397 | return cpu_nr_migrations(smp_processor_id()); | ||
2398 | } | ||
2399 | |||
2400 | static void cpu_migrations_perf_counter_update(struct perf_counter *counter) | ||
2401 | { | ||
2402 | u64 prev, now; | ||
2403 | s64 delta; | ||
2404 | |||
2405 | prev = atomic64_read(&counter->hw.prev_count); | ||
2406 | now = get_cpu_migrations(counter); | ||
2407 | |||
2408 | atomic64_set(&counter->hw.prev_count, now); | ||
2409 | |||
2410 | delta = now - prev; | ||
2411 | |||
2412 | atomic64_add(delta, &counter->count); | ||
2413 | } | ||
2414 | |||
2415 | static void cpu_migrations_perf_counter_read(struct perf_counter *counter) | ||
2416 | { | ||
2417 | cpu_migrations_perf_counter_update(counter); | ||
2418 | } | ||
2419 | |||
2420 | static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) | ||
2421 | { | ||
2422 | if (counter->prev_state <= PERF_COUNTER_STATE_OFF) | ||
2423 | atomic64_set(&counter->hw.prev_count, | ||
2424 | get_cpu_migrations(counter)); | ||
2425 | return 0; | ||
2426 | } | ||
2427 | |||
2428 | static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) | ||
2429 | { | ||
2430 | cpu_migrations_perf_counter_update(counter); | ||
2431 | } | ||
2432 | |||
2433 | static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { | ||
2434 | .enable = cpu_migrations_perf_counter_enable, | ||
2435 | .disable = cpu_migrations_perf_counter_disable, | ||
2436 | .read = cpu_migrations_perf_counter_read, | ||
2437 | }; | ||
2438 | |||
2439 | #ifdef CONFIG_EVENT_PROFILE | ||
2440 | void perf_tpcounter_event(int event_id) | ||
2441 | { | ||
2442 | struct pt_regs *regs = get_irq_regs(); | ||
2443 | |||
2444 | if (!regs) | ||
2445 | regs = task_pt_regs(current); | ||
2446 | |||
2447 | __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); | ||
2448 | } | ||
2449 | |||
2450 | extern int ftrace_profile_enable(int); | ||
2451 | extern void ftrace_profile_disable(int); | ||
2452 | |||
2453 | static void tp_perf_counter_destroy(struct perf_counter *counter) | ||
2454 | { | ||
2455 | ftrace_profile_disable(perf_event_id(&counter->hw_event)); | ||
2456 | } | ||
2457 | |||
2458 | static const struct hw_perf_counter_ops * | ||
2459 | tp_perf_counter_init(struct perf_counter *counter) | ||
2460 | { | ||
2461 | int event_id = perf_event_id(&counter->hw_event); | ||
2462 | int ret; | ||
2463 | |||
2464 | ret = ftrace_profile_enable(event_id); | ||
2465 | if (ret) | ||
2466 | return NULL; | ||
2467 | |||
2468 | counter->destroy = tp_perf_counter_destroy; | ||
2469 | counter->hw.irq_period = counter->hw_event.irq_period; | ||
2470 | |||
2471 | return &perf_ops_generic; | ||
2472 | } | ||
2473 | #else | ||
2474 | static const struct hw_perf_counter_ops * | ||
2475 | tp_perf_counter_init(struct perf_counter *counter) | ||
2476 | { | ||
2477 | return NULL; | ||
2478 | } | ||
2479 | #endif | ||
2480 | |||
2481 | static const struct hw_perf_counter_ops * | ||
2482 | sw_perf_counter_init(struct perf_counter *counter) | ||
2483 | { | ||
2484 | struct perf_counter_hw_event *hw_event = &counter->hw_event; | ||
2485 | const struct hw_perf_counter_ops *hw_ops = NULL; | ||
2486 | struct hw_perf_counter *hwc = &counter->hw; | ||
2487 | |||
2488 | /* | ||
2489 | * Software counters (currently) can't in general distinguish | ||
2490 | * between user, kernel and hypervisor events. | ||
2491 | * However, context switches and cpu migrations are considered | ||
2492 | * to be kernel events, and page faults are never hypervisor | ||
2493 | * events. | ||
2494 | */ | ||
2495 | switch (perf_event_id(&counter->hw_event)) { | ||
2496 | case PERF_COUNT_CPU_CLOCK: | ||
2497 | hw_ops = &perf_ops_cpu_clock; | ||
2498 | |||
2499 | if (hw_event->irq_period && hw_event->irq_period < 10000) | ||
2500 | hw_event->irq_period = 10000; | ||
2501 | break; | ||
2502 | case PERF_COUNT_TASK_CLOCK: | ||
2503 | /* | ||
2504 | * If the user instantiates this as a per-cpu counter, | ||
2505 | * use the cpu_clock counter instead. | ||
2506 | */ | ||
2507 | if (counter->ctx->task) | ||
2508 | hw_ops = &perf_ops_task_clock; | ||
2509 | else | ||
2510 | hw_ops = &perf_ops_cpu_clock; | ||
2511 | |||
2512 | if (hw_event->irq_period && hw_event->irq_period < 10000) | ||
2513 | hw_event->irq_period = 10000; | ||
2514 | break; | ||
2515 | case PERF_COUNT_PAGE_FAULTS: | ||
2516 | case PERF_COUNT_PAGE_FAULTS_MIN: | ||
2517 | case PERF_COUNT_PAGE_FAULTS_MAJ: | ||
2518 | case PERF_COUNT_CONTEXT_SWITCHES: | ||
2519 | hw_ops = &perf_ops_generic; | ||
2520 | break; | ||
2521 | case PERF_COUNT_CPU_MIGRATIONS: | ||
2522 | if (!counter->hw_event.exclude_kernel) | ||
2523 | hw_ops = &perf_ops_cpu_migrations; | ||
2524 | break; | ||
2525 | } | ||
2526 | |||
2527 | if (hw_ops) | ||
2528 | hwc->irq_period = hw_event->irq_period; | ||
2529 | |||
2530 | return hw_ops; | ||
2531 | } | ||
2532 | |||
2533 | /* | ||
2534 | * Allocate and initialize a counter structure | ||
2535 | */ | ||
2536 | static struct perf_counter * | ||
2537 | perf_counter_alloc(struct perf_counter_hw_event *hw_event, | ||
2538 | int cpu, | ||
2539 | struct perf_counter_context *ctx, | ||
2540 | struct perf_counter *group_leader, | ||
2541 | gfp_t gfpflags) | ||
2542 | { | ||
2543 | const struct hw_perf_counter_ops *hw_ops; | ||
2544 | struct perf_counter *counter; | ||
2545 | long err; | ||
2546 | |||
2547 | counter = kzalloc(sizeof(*counter), gfpflags); | ||
2548 | if (!counter) | ||
2549 | return ERR_PTR(-ENOMEM); | ||
2550 | |||
2551 | /* | ||
2552 | * Single counters are their own group leaders, with an | ||
2553 | * empty sibling list: | ||
2554 | */ | ||
2555 | if (!group_leader) | ||
2556 | group_leader = counter; | ||
2557 | |||
2558 | mutex_init(&counter->mutex); | ||
2559 | INIT_LIST_HEAD(&counter->list_entry); | ||
2560 | INIT_LIST_HEAD(&counter->event_entry); | ||
2561 | INIT_LIST_HEAD(&counter->sibling_list); | ||
2562 | init_waitqueue_head(&counter->waitq); | ||
2563 | |||
2564 | mutex_init(&counter->mmap_mutex); | ||
2565 | |||
2566 | INIT_LIST_HEAD(&counter->child_list); | ||
2567 | |||
2568 | counter->cpu = cpu; | ||
2569 | counter->hw_event = *hw_event; | ||
2570 | counter->group_leader = group_leader; | ||
2571 | counter->hw_ops = NULL; | ||
2572 | counter->ctx = ctx; | ||
2573 | |||
2574 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
2575 | if (hw_event->disabled) | ||
2576 | counter->state = PERF_COUNTER_STATE_OFF; | ||
2577 | |||
2578 | hw_ops = NULL; | ||
2579 | |||
2580 | if (perf_event_raw(hw_event)) { | ||
2581 | hw_ops = hw_perf_counter_init(counter); | ||
2582 | goto done; | ||
2583 | } | ||
2584 | |||
2585 | switch (perf_event_type(hw_event)) { | ||
2586 | case PERF_TYPE_HARDWARE: | ||
2587 | hw_ops = hw_perf_counter_init(counter); | ||
2588 | break; | ||
2589 | |||
2590 | case PERF_TYPE_SOFTWARE: | ||
2591 | hw_ops = sw_perf_counter_init(counter); | ||
2592 | break; | ||
2593 | |||
2594 | case PERF_TYPE_TRACEPOINT: | ||
2595 | hw_ops = tp_perf_counter_init(counter); | ||
2596 | break; | ||
2597 | } | ||
2598 | done: | ||
2599 | err = 0; | ||
2600 | if (!hw_ops) | ||
2601 | err = -EINVAL; | ||
2602 | else if (IS_ERR(hw_ops)) | ||
2603 | err = PTR_ERR(hw_ops); | ||
2604 | |||
2605 | if (err) { | ||
2606 | kfree(counter); | ||
2607 | return ERR_PTR(err); | ||
2608 | } | ||
2609 | |||
2610 | counter->hw_ops = hw_ops; | ||
2611 | |||
2612 | return counter; | ||
2613 | } | ||
2614 | |||
2615 | /** | ||
2616 | * sys_perf_counter_open - open a performance counter, associate it to a task/cpu | ||
2617 | * | ||
2618 | * @hw_event_uptr: event type attributes for monitoring/sampling | ||
2619 | * @pid: target pid | ||
2620 | * @cpu: target cpu | ||
2621 | * @group_fd: group leader counter fd | ||
2622 | */ | ||
2623 | SYSCALL_DEFINE5(perf_counter_open, | ||
2624 | const struct perf_counter_hw_event __user *, hw_event_uptr, | ||
2625 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | ||
2626 | { | ||
2627 | struct perf_counter *counter, *group_leader; | ||
2628 | struct perf_counter_hw_event hw_event; | ||
2629 | struct perf_counter_context *ctx; | ||
2630 | struct file *counter_file = NULL; | ||
2631 | struct file *group_file = NULL; | ||
2632 | int fput_needed = 0; | ||
2633 | int fput_needed2 = 0; | ||
2634 | int ret; | ||
2635 | |||
2636 | /* for future expandability... */ | ||
2637 | if (flags) | ||
2638 | return -EINVAL; | ||
2639 | |||
2640 | if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) | ||
2641 | return -EFAULT; | ||
2642 | |||
2643 | /* | ||
2644 | * Get the target context (task or percpu): | ||
2645 | */ | ||
2646 | ctx = find_get_context(pid, cpu); | ||
2647 | if (IS_ERR(ctx)) | ||
2648 | return PTR_ERR(ctx); | ||
2649 | |||
2650 | /* | ||
2651 | * Look up the group leader (we will attach this counter to it): | ||
2652 | */ | ||
2653 | group_leader = NULL; | ||
2654 | if (group_fd != -1) { | ||
2655 | ret = -EINVAL; | ||
2656 | group_file = fget_light(group_fd, &fput_needed); | ||
2657 | if (!group_file) | ||
2658 | goto err_put_context; | ||
2659 | if (group_file->f_op != &perf_fops) | ||
2660 | goto err_put_context; | ||
2661 | |||
2662 | group_leader = group_file->private_data; | ||
2663 | /* | ||
2664 | * Do not allow a recursive hierarchy (this new sibling | ||
2665 | * becoming part of another group-sibling): | ||
2666 | */ | ||
2667 | if (group_leader->group_leader != group_leader) | ||
2668 | goto err_put_context; | ||
2669 | /* | ||
2670 | * Do not allow to attach to a group in a different | ||
2671 | * task or CPU context: | ||
2672 | */ | ||
2673 | if (group_leader->ctx != ctx) | ||
2674 | goto err_put_context; | ||
2675 | /* | ||
2676 | * Only a group leader can be exclusive or pinned | ||
2677 | */ | ||
2678 | if (hw_event.exclusive || hw_event.pinned) | ||
2679 | goto err_put_context; | ||
2680 | } | ||
2681 | |||
2682 | counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, | ||
2683 | GFP_KERNEL); | ||
2684 | ret = PTR_ERR(counter); | ||
2685 | if (IS_ERR(counter)) | ||
2686 | goto err_put_context; | ||
2687 | |||
2688 | ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); | ||
2689 | if (ret < 0) | ||
2690 | goto err_free_put_context; | ||
2691 | |||
2692 | counter_file = fget_light(ret, &fput_needed2); | ||
2693 | if (!counter_file) | ||
2694 | goto err_free_put_context; | ||
2695 | |||
2696 | counter->filp = counter_file; | ||
2697 | mutex_lock(&ctx->mutex); | ||
2698 | perf_install_in_context(ctx, counter, cpu); | ||
2699 | mutex_unlock(&ctx->mutex); | ||
2700 | |||
2701 | fput_light(counter_file, fput_needed2); | ||
2702 | |||
2703 | out_fput: | ||
2704 | fput_light(group_file, fput_needed); | ||
2705 | |||
2706 | return ret; | ||
2707 | |||
2708 | err_free_put_context: | ||
2709 | kfree(counter); | ||
2710 | |||
2711 | err_put_context: | ||
2712 | put_context(ctx); | ||
2713 | |||
2714 | goto out_fput; | ||
2715 | } | ||
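User space reaches this entry point with a raw syscall(); the sketch below opens a counter on the current task, on any CPU, without a group leader, which is the simplest combination the checks above accept. The header path and the __NR_perf_counter_open number are assumed to come from the patched kernel headers:

    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>     /* assumed: patched headers on the include path */

    /*
     * Illustrative only: open a counter on the current task (pid 0),
     * any CPU (-1), with no group leader (-1) and no flags.
     */
    static int counter_open(struct perf_counter_hw_event *hw_event)
    {
            return syscall(__NR_perf_counter_open, hw_event,
                           (pid_t)0, -1, -1, 0UL);
    }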
2716 | |||
2717 | /* | ||
2718 | * Initialize the perf_counter context in a task_struct: | ||
2719 | */ | ||
2720 | static void | ||
2721 | __perf_counter_init_context(struct perf_counter_context *ctx, | ||
2722 | struct task_struct *task) | ||
2723 | { | ||
2724 | memset(ctx, 0, sizeof(*ctx)); | ||
2725 | spin_lock_init(&ctx->lock); | ||
2726 | mutex_init(&ctx->mutex); | ||
2727 | INIT_LIST_HEAD(&ctx->counter_list); | ||
2728 | INIT_LIST_HEAD(&ctx->event_list); | ||
2729 | ctx->task = task; | ||
2730 | } | ||
2731 | |||
2732 | /* | ||
2733 | * inherit a counter from parent task to child task: | ||
2734 | */ | ||
2735 | static struct perf_counter * | ||
2736 | inherit_counter(struct perf_counter *parent_counter, | ||
2737 | struct task_struct *parent, | ||
2738 | struct perf_counter_context *parent_ctx, | ||
2739 | struct task_struct *child, | ||
2740 | struct perf_counter *group_leader, | ||
2741 | struct perf_counter_context *child_ctx) | ||
2742 | { | ||
2743 | struct perf_counter *child_counter; | ||
2744 | |||
2745 | /* | ||
2746 | * Instead of creating recursive hierarchies of counters, | ||
2747 | * we link inherited counters back to the original parent, which | ||
2748 | * is guaranteed to have a filp that we can use as the reference | ||
2749 | * count: | ||
2750 | */ | ||
2751 | if (parent_counter->parent) | ||
2752 | parent_counter = parent_counter->parent; | ||
2753 | |||
2754 | child_counter = perf_counter_alloc(&parent_counter->hw_event, | ||
2755 | parent_counter->cpu, child_ctx, | ||
2756 | group_leader, GFP_KERNEL); | ||
2757 | if (IS_ERR(child_counter)) | ||
2758 | return child_counter; | ||
2759 | |||
2760 | /* | ||
2761 | * Link it up in the child's context: | ||
2762 | */ | ||
2763 | child_counter->task = child; | ||
2764 | add_counter_to_ctx(child_counter, child_ctx); | ||
2765 | |||
2766 | child_counter->parent = parent_counter; | ||
2767 | /* | ||
2768 | * inherit into child's child as well: | ||
2769 | */ | ||
2770 | child_counter->hw_event.inherit = 1; | ||
2771 | |||
2772 | /* | ||
2773 | * Get a reference to the parent filp - we will fput it | ||
2774 | * when the child counter exits. This is safe to do because | ||
2775 | * we are in the parent and we know that the filp still | ||
2776 | * exists and has a nonzero count: | ||
2777 | */ | ||
2778 | atomic_long_inc(&parent_counter->filp->f_count); | ||
2779 | |||
2780 | /* | ||
2781 | * Link this into the parent counter's child list | ||
2782 | */ | ||
2783 | mutex_lock(&parent_counter->mutex); | ||
2784 | list_add_tail(&child_counter->child_list, &parent_counter->child_list); | ||
2785 | |||
2786 | /* | ||
2787 | * Make the child state follow the state of the parent counter, | ||
2788 | * not its hw_event.disabled bit. We hold the parent's mutex, | ||
2789 | * so we won't race with perf_counter_{en,dis}able_family. | ||
2790 | */ | ||
2791 | if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) | ||
2792 | child_counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
2793 | else | ||
2794 | child_counter->state = PERF_COUNTER_STATE_OFF; | ||
2795 | |||
2796 | mutex_unlock(&parent_counter->mutex); | ||
2797 | |||
2798 | return child_counter; | ||
2799 | } | ||
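A consequence of the re-parenting at the top of inherit_counter() is that the chain never grows: a grandchild's counter points at the original counter, not at its parent's copy, and every inherited counter pins that single filp. A toy userspace model of the flattening, with made-up structure names (not the kernel types):

        #include <assert.h>
        #include <stddef.h>
        #include <stdio.h>

        /* Toy model only: every inherited counter ends up pointing at the
         * root counter; refcount stands in for the root's filp f_count. */
        struct toy_counter {
                struct toy_counter *parent;
                long refcount;
        };

        static struct toy_counter toy_inherit(struct toy_counter *parent)
        {
                struct toy_counter child = { .parent = parent };

                if (parent->parent)             /* collapse the chain */
                        child.parent = parent->parent;
                child.parent->refcount++;       /* like the filp reference */
                return child;
        }

        int main(void)
        {
                struct toy_counter root = { .parent = NULL, .refcount = 1 };
                struct toy_counter c1 = toy_inherit(&root);
                struct toy_counter c2 = toy_inherit(&c1);   /* grandchild */

                assert(c1.parent == &root && c2.parent == &root);
                printf("root refcount: %ld\n", root.refcount);   /* 3 */
                return 0;
        }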
2800 | |||
2801 | static int inherit_group(struct perf_counter *parent_counter, | ||
2802 | struct task_struct *parent, | ||
2803 | struct perf_counter_context *parent_ctx, | ||
2804 | struct task_struct *child, | ||
2805 | struct perf_counter_context *child_ctx) | ||
2806 | { | ||
2807 | struct perf_counter *leader; | ||
2808 | struct perf_counter *sub; | ||
2809 | struct perf_counter *child_ctr; | ||
2810 | |||
2811 | leader = inherit_counter(parent_counter, parent, parent_ctx, | ||
2812 | child, NULL, child_ctx); | ||
2813 | if (IS_ERR(leader)) | ||
2814 | return PTR_ERR(leader); | ||
2815 | list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { | ||
2816 | child_ctr = inherit_counter(sub, parent, parent_ctx, | ||
2817 | child, leader, child_ctx); | ||
2818 | if (IS_ERR(child_ctr)) | ||
2819 | return PTR_ERR(child_ctr); | ||
2820 | } | ||
2821 | return 0; | ||
2822 | } | ||
2823 | |||
2824 | static void sync_child_counter(struct perf_counter *child_counter, | ||
2825 | struct perf_counter *parent_counter) | ||
2826 | { | ||
2827 | u64 parent_val, child_val; | ||
2828 | |||
2829 | parent_val = atomic64_read(&parent_counter->count); | ||
2830 | child_val = atomic64_read(&child_counter->count); | ||
2831 | |||
2832 | /* | ||
2833 | * Add back the child's count to the parent's count: | ||
2834 | */ | ||
2835 | atomic64_add(child_val, &parent_counter->count); | ||
2836 | atomic64_add(child_counter->total_time_enabled, | ||
2837 | &parent_counter->child_total_time_enabled); | ||
2838 | atomic64_add(child_counter->total_time_running, | ||
2839 | &parent_counter->child_total_time_running); | ||
2840 | |||
2841 | /* | ||
2842 | * Remove this counter from the parent's list | ||
2843 | */ | ||
2844 | mutex_lock(&parent_counter->mutex); | ||
2845 | list_del_init(&child_counter->child_list); | ||
2846 | mutex_unlock(&parent_counter->mutex); | ||
2847 | |||
2848 | /* | ||
2849 | * Release the parent counter, if this was the last | ||
2850 | * reference to it. | ||
2851 | */ | ||
2852 | fput(parent_counter->filp); | ||
2853 | } | ||
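The fold-back above is what makes an inherited counter read as one total in the parent: each exiting child atomically adds its final count (and its enabled/running times) into the parent counter, then drops the filp reference taken at inherit time. A minimal sketch of just the aggregation step, using C11 atomics as a userspace stand-in for the kernel's atomic64_t:

        #include <stdatomic.h>
        #include <stdio.h>

        /* Toy model only: each child folds its final value into the
         * parent's 64-bit aggregate exactly once, at exit. */
        static _Atomic unsigned long long parent_count;

        static void toy_sync_child(unsigned long long child_final)
        {
                atomic_fetch_add(&parent_count, child_final);
        }

        int main(void)
        {
                toy_sync_child(1000);   /* child A's final count */
                toy_sync_child(250);    /* child B's final count */
                printf("parent sees %llu events\n",
                       (unsigned long long)atomic_load(&parent_count));
                return 0;
        }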
2854 | |||
2855 | static void | ||
2856 | __perf_counter_exit_task(struct task_struct *child, | ||
2857 | struct perf_counter *child_counter, | ||
2858 | struct perf_counter_context *child_ctx) | ||
2859 | { | ||
2860 | struct perf_counter *parent_counter; | ||
2861 | struct perf_counter *sub, *tmp; | ||
2862 | |||
2863 | /* | ||
2864 | * If we do not self-reap then we have to wait for the | ||
2865 | * child task to unschedule (it will happen for sure), | ||
2866 | * so that its counter is at its final count. (This | ||
2867 | * condition triggers rarely - child tasks usually get | ||
2868 | * off their CPU before the parent has a chance to | ||
2869 | * get this far into the reaping action) | ||
2870 | */ | ||
2871 | if (child != current) { | ||
2872 | wait_task_inactive(child, 0); | ||
2873 | list_del_init(&child_counter->list_entry); | ||
2874 | update_counter_times(child_counter); | ||
2875 | } else { | ||
2876 | struct perf_cpu_context *cpuctx; | ||
2877 | unsigned long flags; | ||
2878 | u64 perf_flags; | ||
2879 | |||
2880 | /* | ||
2881 | * Disable and unlink this counter. | ||
2882 | * | ||
2883 | * Be careful about zapping the list - IRQ/NMI context | ||
2884 | * could still be processing it: | ||
2885 | */ | ||
2886 | local_irq_save(flags); | ||
2887 | perf_flags = hw_perf_save_disable(); | ||
2888 | |||
2889 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
2890 | |||
2891 | group_sched_out(child_counter, cpuctx, child_ctx); | ||
2892 | update_counter_times(child_counter); | ||
2893 | |||
2894 | list_del_init(&child_counter->list_entry); | ||
2895 | |||
2896 | child_ctx->nr_counters--; | ||
2897 | |||
2898 | hw_perf_restore(perf_flags); | ||
2899 | local_irq_restore(flags); | ||
2900 | } | ||
2901 | |||
2902 | parent_counter = child_counter->parent; | ||
2903 | /* | ||
2904 | * It can happen that parent exits first, and has counters | ||
2905 | * that are still around due to the child reference. These | ||
2906 | * counters need to be zapped - but otherwise linger. | ||
2907 | */ | ||
2908 | if (parent_counter) { | ||
2909 | sync_child_counter(child_counter, parent_counter); | ||
2910 | list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, | ||
2911 | list_entry) { | ||
2912 | if (sub->parent) { | ||
2913 | sync_child_counter(sub, sub->parent); | ||
2914 | free_counter(sub); | ||
2915 | } | ||
2916 | } | ||
2917 | free_counter(child_counter); | ||
2918 | } | ||
2919 | } | ||
2920 | |||
2921 | /* | ||
2922 | * When a child task exits, feed back counter values to parent counters. | ||
2923 | * | ||
2924 | * Note: we may be running in child context, but the PID is not hashed | ||
2925 | * anymore so new counters will not be added. | ||
2926 | */ | ||
2927 | void perf_counter_exit_task(struct task_struct *child) | ||
2928 | { | ||
2929 | struct perf_counter *child_counter, *tmp; | ||
2930 | struct perf_counter_context *child_ctx; | ||
2931 | |||
2932 | child_ctx = &child->perf_counter_ctx; | ||
2933 | |||
2934 | if (likely(!child_ctx->nr_counters)) | ||
2935 | return; | ||
2936 | |||
2937 | list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, | ||
2938 | list_entry) | ||
2939 | __perf_counter_exit_task(child, child_counter, child_ctx); | ||
2940 | } | ||
2941 | |||
2942 | /* | ||
2943 | * Initialize the perf_counter context in task_struct | ||
2944 | */ | ||
2945 | void perf_counter_init_task(struct task_struct *child) | ||
2946 | { | ||
2947 | struct perf_counter_context *child_ctx, *parent_ctx; | ||
2948 | struct perf_counter *counter; | ||
2949 | struct task_struct *parent = current; | ||
2950 | |||
2951 | child_ctx = &child->perf_counter_ctx; | ||
2952 | parent_ctx = &parent->perf_counter_ctx; | ||
2953 | |||
2954 | __perf_counter_init_context(child_ctx, child); | ||
2955 | |||
2956 | /* | ||
2957 | * This is executed from the parent task context, so inherit | ||
2958 | * counters that have been marked for cloning: | ||
2959 | */ | ||
2960 | |||
2961 | if (likely(!parent_ctx->nr_counters)) | ||
2962 | return; | ||
2963 | |||
2964 | /* | ||
2965 | * Lock the parent list. No need to lock the child - not PID | ||
2966 | * hashed yet and not running, so nobody can access it. | ||
2967 | */ | ||
2968 | mutex_lock(&parent_ctx->mutex); | ||
2969 | |||
2970 | /* | ||
2971 | * We don't have to disable NMIs - we are only looking at | ||
2972 | * the list, not manipulating it: | ||
2973 | */ | ||
2974 | list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { | ||
2975 | if (!counter->hw_event.inherit) | ||
2976 | continue; | ||
2977 | |||
2978 | if (inherit_group(counter, parent, | ||
2979 | parent_ctx, child, child_ctx)) | ||
2980 | break; | ||
2981 | } | ||
2982 | |||
2983 | mutex_unlock(&parent_ctx->mutex); | ||
2984 | } | ||
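Inheritance is opt-in per counter: the loop above skips anything whose hw_event.inherit bit is clear, so only counters created with inherit set follow the task across fork(). A toy illustration of that selection, with invented structures and event names:

        #include <stdio.h>

        /* Toy model only: clone just the entries marked for inheritance. */
        struct toy_event { int inherit; const char *name; };

        int main(void)
        {
                struct toy_event parent_ctx[] = {
                        { 1, "cycles" }, { 0, "one-shot" }, { 1, "instructions" },
                };
                int i, n = sizeof(parent_ctx) / sizeof(parent_ctx[0]);

                for (i = 0; i < n; i++) {
                        if (!parent_ctx[i].inherit)
                                continue;
                        printf("cloning '%s' into child context\n",
                               parent_ctx[i].name);
                }
                return 0;
        }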
2985 | |||
2986 | static void __cpuinit perf_counter_init_cpu(int cpu) | ||
2987 | { | ||
2988 | struct perf_cpu_context *cpuctx; | ||
2989 | |||
2990 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
2991 | __perf_counter_init_context(&cpuctx->ctx, NULL); | ||
2992 | |||
2993 | mutex_lock(&perf_resource_mutex); | ||
2994 | cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; | ||
2995 | mutex_unlock(&perf_resource_mutex); | ||
2996 | |||
2997 | hw_perf_counter_setup(cpu); | ||
2998 | } | ||
2999 | |||
3000 | #ifdef CONFIG_HOTPLUG_CPU | ||
3001 | static void __perf_counter_exit_cpu(void *info) | ||
3002 | { | ||
3003 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
3004 | struct perf_counter_context *ctx = &cpuctx->ctx; | ||
3005 | struct perf_counter *counter, *tmp; | ||
3006 | |||
3007 | list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) | ||
3008 | __perf_counter_remove_from_context(counter); | ||
3009 | } | ||
3010 | static void perf_counter_exit_cpu(int cpu) | ||
3011 | { | ||
3012 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
3013 | struct perf_counter_context *ctx = &cpuctx->ctx; | ||
3014 | |||
3015 | mutex_lock(&ctx->mutex); | ||
3016 | smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); | ||
3017 | mutex_unlock(&ctx->mutex); | ||
3018 | } | ||
3019 | #else | ||
3020 | static inline void perf_counter_exit_cpu(int cpu) { } | ||
3021 | #endif | ||
3022 | |||
3023 | static int __cpuinit | ||
3024 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | ||
3025 | { | ||
3026 | unsigned int cpu = (long)hcpu; | ||
3027 | |||
3028 | switch (action) { | ||
3029 | |||
3030 | case CPU_UP_PREPARE: | ||
3031 | case CPU_UP_PREPARE_FROZEN: | ||
3032 | perf_counter_init_cpu(cpu); | ||
3033 | break; | ||
3034 | |||
3035 | case CPU_DOWN_PREPARE: | ||
3036 | case CPU_DOWN_PREPARE_FROZEN: | ||
3037 | perf_counter_exit_cpu(cpu); | ||
3038 | break; | ||
3039 | |||
3040 | default: | ||
3041 | break; | ||
3042 | } | ||
3043 | |||
3044 | return NOTIFY_OK; | ||
3045 | } | ||
3046 | |||
3047 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
3048 | .notifier_call = perf_cpu_notify, | ||
3049 | }; | ||
3050 | |||
3051 | static int __init perf_counter_init(void) | ||
3052 | { | ||
3053 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | ||
3054 | (void *)(long)smp_processor_id()); | ||
3055 | register_cpu_notifier(&perf_cpu_nb); | ||
3056 | |||
3057 | return 0; | ||
3058 | } | ||
3059 | early_initcall(perf_counter_init); | ||
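perf_counter_init() bootstraps the boot CPU by invoking the notifier by hand with CPU_UP_PREPARE before registering it, so the CPU that is already online goes through the same per-CPU setup path as CPUs that come up later. A toy model of that bootstrap-then-register pattern, with invented names:

        #include <stdio.h>

        /* Toy model only: call the handler once for the already-online CPU,
         * then (in the real code) register it for future hotplug events. */
        enum toy_action { TOY_UP_PREPARE, TOY_DOWN_PREPARE };

        static void toy_notify(enum toy_action action, int cpu)
        {
                switch (action) {
                case TOY_UP_PREPARE:
                        printf("init per-cpu state for cpu %d\n", cpu);
                        break;
                case TOY_DOWN_PREPARE:
                        printf("tear down per-cpu state for cpu %d\n", cpu);
                        break;
                }
        }

        int main(void)
        {
                toy_notify(TOY_UP_PREPARE, 0);  /* boot CPU, already online */
                /* register_cpu_notifier() equivalent would go here */
                return 0;
        }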
3060 | |||
3061 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | ||
3062 | { | ||
3063 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
3064 | } | ||
3065 | |||
3066 | static ssize_t | ||
3067 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
3068 | const char *buf, | ||
3069 | size_t count) | ||
3070 | { | ||
3071 | struct perf_cpu_context *cpuctx; | ||
3072 | unsigned long val; | ||
3073 | int err, cpu, mpt; | ||
3074 | |||
3075 | err = strict_strtoul(buf, 10, &val); | ||
3076 | if (err) | ||
3077 | return err; | ||
3078 | if (val > perf_max_counters) | ||
3079 | return -EINVAL; | ||
3080 | |||
3081 | mutex_lock(&perf_resource_mutex); | ||
3082 | perf_reserved_percpu = val; | ||
3083 | for_each_online_cpu(cpu) { | ||
3084 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
3085 | spin_lock_irq(&cpuctx->ctx.lock); | ||
3086 | mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, | ||
3087 | perf_max_counters - perf_reserved_percpu); | ||
3088 | cpuctx->max_pertask = mpt; | ||
3089 | spin_unlock_irq(&cpuctx->ctx.lock); | ||
3090 | } | ||
3091 | mutex_unlock(&perf_resource_mutex); | ||
3092 | |||
3093 | return count; | ||
3094 | } | ||
3095 | |||
3096 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | ||
3097 | { | ||
3098 | return sprintf(buf, "%d\n", perf_overcommit); | ||
3099 | } | ||
3100 | |||
3101 | static ssize_t | ||
3102 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | ||
3103 | { | ||
3104 | unsigned long val; | ||
3105 | int err; | ||
3106 | |||
3107 | err = strict_strtoul(buf, 10, &val); | ||
3108 | if (err) | ||
3109 | return err; | ||
3110 | if (val > 1) | ||
3111 | return -EINVAL; | ||
3112 | |||
3113 | mutex_lock(&perf_resource_mutex); | ||
3114 | perf_overcommit = val; | ||
3115 | mutex_unlock(&perf_resource_mutex); | ||
3116 | |||
3117 | return count; | ||
3118 | } | ||
3119 | |||
3120 | static SYSDEV_CLASS_ATTR( | ||
3121 | reserve_percpu, | ||
3122 | 0644, | ||
3123 | perf_show_reserve_percpu, | ||
3124 | perf_set_reserve_percpu | ||
3125 | ); | ||
3126 | |||
3127 | static SYSDEV_CLASS_ATTR( | ||
3128 | overcommit, | ||
3129 | 0644, | ||
3130 | perf_show_overcommit, | ||
3131 | perf_set_overcommit | ||
3132 | ); | ||
3133 | |||
3134 | static struct attribute *perfclass_attrs[] = { | ||
3135 | &attr_reserve_percpu.attr, | ||
3136 | &attr_overcommit.attr, | ||
3137 | NULL | ||
3138 | }; | ||
3139 | |||
3140 | static struct attribute_group perfclass_attr_group = { | ||
3141 | .attrs = perfclass_attrs, | ||
3142 | .name = "perf_counters", | ||
3143 | }; | ||
3144 | |||
3145 | static int __init perf_counter_sysfs_init(void) | ||
3146 | { | ||
3147 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
3148 | &perfclass_attr_group); | ||
3149 | } | ||
3150 | device_initcall(perf_counter_sysfs_init); | ||
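These two writable attributes are the only runtime knobs the patch exposes: reserve_percpu holds back that many counters on each CPU from per-task use (e.g. if perf_max_counters were 8 and you write 2, a CPU with no counters scheduled gets max_pertask = min(8 - 0, 8 - 2) = 6), and overcommit accepts only 0 or 1. The group is created on the cpu sysdev class, so it should appear under /sys/devices/system/cpu/perf_counters/ - that path and the example numbers are assumptions on my part. A small reader sketch:

        #include <stdio.h>

        /* Sketch only: the sysfs path assumes the cpu sysdev class is
         * exposed as /sys/devices/system/cpu/. */
        static void show(const char *name)
        {
                char path[128], buf[32];
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/devices/system/cpu/perf_counters/%s", name);
                f = fopen(path, "r");
                if (!f) {
                        perror(path);
                        return;
                }
                if (fgets(buf, sizeof(buf), f))
                        printf("%s = %s", name, buf);
                fclose(f);
        }

        int main(void)
        {
                show("reserve_percpu");
                show("overcommit");
                return 0;
        }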
diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d5072..b66a08c2480e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -584,6 +584,7 @@ struct rq { | |||
584 | struct load_weight load; | 584 | struct load_weight load; |
585 | unsigned long nr_load_updates; | 585 | unsigned long nr_load_updates; |
586 | u64 nr_switches; | 586 | u64 nr_switches; |
587 | u64 nr_migrations_in; | ||
587 | 588 | ||
588 | struct cfs_rq cfs; | 589 | struct cfs_rq cfs; |
589 | struct rt_rq rt; | 590 | struct rt_rq rt; |
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq) | |||
692 | #define task_rq(p) cpu_rq(task_cpu(p)) | 693 | #define task_rq(p) cpu_rq(task_cpu(p)) |
693 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 694 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
694 | 695 | ||
695 | static inline void update_rq_clock(struct rq *rq) | 696 | inline void update_rq_clock(struct rq *rq) |
696 | { | 697 | { |
697 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 698 | rq->clock = sched_clock_cpu(cpu_of(rq)); |
698 | } | 699 | } |
@@ -1955,12 +1956,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1955 | p->se.sleep_start -= clock_offset; | 1956 | p->se.sleep_start -= clock_offset; |
1956 | if (p->se.block_start) | 1957 | if (p->se.block_start) |
1957 | p->se.block_start -= clock_offset; | 1958 | p->se.block_start -= clock_offset; |
1959 | #endif | ||
1958 | if (old_cpu != new_cpu) { | 1960 | if (old_cpu != new_cpu) { |
1959 | schedstat_inc(p, se.nr_migrations); | 1961 | p->se.nr_migrations++; |
1962 | new_rq->nr_migrations_in++; | ||
1963 | #ifdef CONFIG_SCHEDSTATS | ||
1960 | if (task_hot(p, old_rq->clock, NULL)) | 1964 | if (task_hot(p, old_rq->clock, NULL)) |
1961 | schedstat_inc(p, se.nr_forced2_migrations); | 1965 | schedstat_inc(p, se.nr_forced2_migrations); |
1962 | } | ||
1963 | #endif | 1966 | #endif |
1967 | } | ||
1964 | p->se.vruntime -= old_cfsrq->min_vruntime - | 1968 | p->se.vruntime -= old_cfsrq->min_vruntime - |
1965 | new_cfsrq->min_vruntime; | 1969 | new_cfsrq->min_vruntime; |
1966 | 1970 | ||
@@ -2312,6 +2316,27 @@ static int sched_balance_self(int cpu, int flag) | |||
2312 | 2316 | ||
2313 | #endif /* CONFIG_SMP */ | 2317 | #endif /* CONFIG_SMP */ |
2314 | 2318 | ||
2319 | /** | ||
2320 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2321 | * @p: the task to evaluate | ||
2322 | * @func: the function to be called | ||
2323 | * @info: the function call argument | ||
2324 | * | ||
2325 | * Calls the function @func only when the task is currently running. If that | ||
2326 | * happens to be on the current CPU, the function is called directly. | ||
2327 | */ | ||
2328 | void task_oncpu_function_call(struct task_struct *p, | ||
2329 | void (*func) (void *info), void *info) | ||
2330 | { | ||
2331 | int cpu; | ||
2332 | |||
2333 | preempt_disable(); | ||
2334 | cpu = task_cpu(p); | ||
2335 | if (task_curr(p)) | ||
2336 | smp_call_function_single(cpu, func, info, 1); | ||
2337 | preempt_enable(); | ||
2338 | } | ||
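If the task is not running, the call is simply skipped; if it is, the function runs on the task's CPU via a single-target IPI, presumably so the counter code can touch PMU state that only that CPU owns. There is no userspace equivalent of the IPI, but the intent can be mimicked by temporarily pinning to the target CPU - a rough analogue only, not the kernel mechanism:

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        /* Userspace analogue: pin ourselves to the target CPU, run the
         * callback there, then restore the previous affinity mask. */
        static int run_on_cpu(int cpu, void (*func)(void *), void *info)
        {
                cpu_set_t old, new;

                if (sched_getaffinity(0, sizeof(old), &old))
                        return -1;
                CPU_ZERO(&new);
                CPU_SET(cpu, &new);
                if (sched_setaffinity(0, sizeof(new), &new))
                        return -1;
                func(info);
                return sched_setaffinity(0, sizeof(old), &old);
        }

        static void hello(void *info)
        {
                printf("running on cpu %d\n", sched_getcpu());
        }

        int main(void)
        {
                return run_on_cpu(0, hello, NULL) ? 1 : 0;
        }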
2339 | |||
2315 | /*** | 2340 | /*** |
2316 | * try_to_wake_up - wake up a thread | 2341 | * try_to_wake_up - wake up a thread |
2317 | * @p: the to-be-woken-up thread | 2342 | * @p: the to-be-woken-up thread |
@@ -2468,6 +2493,7 @@ static void __sched_fork(struct task_struct *p) | |||
2468 | p->se.exec_start = 0; | 2493 | p->se.exec_start = 0; |
2469 | p->se.sum_exec_runtime = 0; | 2494 | p->se.sum_exec_runtime = 0; |
2470 | p->se.prev_sum_exec_runtime = 0; | 2495 | p->se.prev_sum_exec_runtime = 0; |
2496 | p->se.nr_migrations = 0; | ||
2471 | p->se.last_wakeup = 0; | 2497 | p->se.last_wakeup = 0; |
2472 | p->se.avg_overlap = 0; | 2498 | p->se.avg_overlap = 0; |
2473 | p->se.start_runtime = 0; | 2499 | p->se.start_runtime = 0; |
@@ -2698,6 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2698 | */ | 2724 | */ |
2699 | prev_state = prev->state; | 2725 | prev_state = prev->state; |
2700 | finish_arch_switch(prev); | 2726 | finish_arch_switch(prev); |
2727 | perf_counter_task_sched_in(current, cpu_of(rq)); | ||
2701 | finish_lock_switch(rq, prev); | 2728 | finish_lock_switch(rq, prev); |
2702 | #ifdef CONFIG_SMP | 2729 | #ifdef CONFIG_SMP |
2703 | if (post_schedule) | 2730 | if (post_schedule) |
@@ -2860,6 +2887,15 @@ unsigned long nr_active(void) | |||
2860 | } | 2887 | } |
2861 | 2888 | ||
2862 | /* | 2889 | /* |
2890 | * Externally visible per-cpu scheduler statistics: | ||
2891 | * cpu_nr_migrations(cpu) - number of migrations into that cpu | ||
2892 | */ | ||
2893 | u64 cpu_nr_migrations(int cpu) | ||
2894 | { | ||
2895 | return cpu_rq(cpu)->nr_migrations_in; | ||
2896 | } | ||
2897 | |||
2898 | /* | ||
2863 | * Update rq->cpu_load[] statistics. This function is usually called every | 2899 | * Update rq->cpu_load[] statistics. This function is usually called every |
2864 | * scheduler tick (TICK_NSEC). | 2900 | * scheduler tick (TICK_NSEC). |
2865 | */ | 2901 | */ |
@@ -4514,6 +4550,29 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
4514 | * Return any ns on the sched_clock that have not yet been banked in | 4550 | * Return any ns on the sched_clock that have not yet been banked in |
4515 | * @p in case that task is currently running. | 4551 | * @p in case that task is currently running. |
4516 | */ | 4552 | */ |
4553 | unsigned long long __task_delta_exec(struct task_struct *p, int update) | ||
4554 | { | ||
4555 | s64 delta_exec; | ||
4556 | struct rq *rq; | ||
4557 | |||
4558 | rq = task_rq(p); | ||
4559 | WARN_ON_ONCE(!runqueue_is_locked()); | ||
4560 | WARN_ON_ONCE(!task_current(rq, p)); | ||
4561 | |||
4562 | if (update) | ||
4563 | update_rq_clock(rq); | ||
4564 | |||
4565 | delta_exec = rq->clock - p->se.exec_start; | ||
4566 | |||
4567 | WARN_ON_ONCE(delta_exec < 0); | ||
4568 | |||
4569 | return delta_exec; | ||
4570 | } | ||
4571 | |||
4572 | /* | ||
4573 | * Return any ns on the sched_clock that have not yet been banked in | ||
4574 | * @p in case that task is currently running. | ||
4575 | */ | ||
4517 | unsigned long long task_delta_exec(struct task_struct *p) | 4576 | unsigned long long task_delta_exec(struct task_struct *p) |
4518 | { | 4577 | { |
4519 | unsigned long flags; | 4578 | unsigned long flags; |
@@ -4773,6 +4832,7 @@ void scheduler_tick(void) | |||
4773 | update_rq_clock(rq); | 4832 | update_rq_clock(rq); |
4774 | update_cpu_load(rq); | 4833 | update_cpu_load(rq); |
4775 | curr->sched_class->task_tick(rq, curr, 0); | 4834 | curr->sched_class->task_tick(rq, curr, 0); |
4835 | perf_counter_task_tick(curr, cpu); | ||
4776 | spin_unlock(&rq->lock); | 4836 | spin_unlock(&rq->lock); |
4777 | 4837 | ||
4778 | #ifdef CONFIG_SMP | 4838 | #ifdef CONFIG_SMP |
@@ -4988,6 +5048,7 @@ need_resched_nonpreemptible: | |||
4988 | 5048 | ||
4989 | if (likely(prev != next)) { | 5049 | if (likely(prev != next)) { |
4990 | sched_info_switch(prev, next); | 5050 | sched_info_switch(prev, next); |
5051 | perf_counter_task_sched_out(prev, cpu); | ||
4991 | 5052 | ||
4992 | rq->nr_switches++; | 5053 | rq->nr_switches++; |
4993 | rq->curr = next; | 5054 | rq->curr = next; |
diff --git a/kernel/sys.c b/kernel/sys.c index 51dbb55604e8..14c4c5613118 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/prctl.h> | 14 | #include <linux/prctl.h> |
15 | #include <linux/highuid.h> | 15 | #include <linux/highuid.h> |
16 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
17 | #include <linux/perf_counter.h> | ||
17 | #include <linux/resource.h> | 18 | #include <linux/resource.h> |
18 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
19 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
@@ -1799,6 +1800,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1799 | case PR_SET_TSC: | 1800 | case PR_SET_TSC: |
1800 | error = SET_TSC_CTL(arg2); | 1801 | error = SET_TSC_CTL(arg2); |
1801 | break; | 1802 | break; |
1803 | case PR_TASK_PERF_COUNTERS_DISABLE: | ||
1804 | error = perf_counter_task_disable(); | ||
1805 | break; | ||
1806 | case PR_TASK_PERF_COUNTERS_ENABLE: | ||
1807 | error = perf_counter_task_enable(); | ||
1808 | break; | ||
1802 | case PR_GET_TIMERSLACK: | 1809 | case PR_GET_TIMERSLACK: |
1803 | error = current->timer_slack_ns; | 1810 | error = current->timer_slack_ns; |
1804 | break; | 1811 | break; |
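With the two prctl options wired up above, a task can mask out stretches of its own execution from all of its attached counters without touching any file descriptors. A hedged userspace sketch - the PR_TASK_PERF_COUNTERS_* constants are the ones this patchset adds to <linux/prctl.h>, so the build check below guards against headers that predate it:

        #include <stdio.h>
        #include <sys/prctl.h>

        #ifndef PR_TASK_PERF_COUNTERS_DISABLE
        #error "build against headers that carry the perf_counter prctl options"
        #endif

        int main(void)
        {
                /* stop this task's counters around a region we do not care about */
                if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
                        perror("prctl(PR_TASK_PERF_COUNTERS_DISABLE)");

                /* ... uninteresting work ... */

                if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
                        perror("prctl(PR_TASK_PERF_COUNTERS_ENABLE)");
                return 0;
        }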
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 27dad2967387..68320f6b07b5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime); | |||
175 | cond_syscall(compat_sys_timerfd_gettime); | 175 | cond_syscall(compat_sys_timerfd_gettime); |
176 | cond_syscall(sys_eventfd); | 176 | cond_syscall(sys_eventfd); |
177 | cond_syscall(sys_eventfd2); | 177 | cond_syscall(sys_eventfd2); |
178 | |||
179 | /* performance counters: */ | ||
180 | cond_syscall(sys_perf_counter_open); | ||
diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e4..672ca25fbc43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/tick.h> | 38 | #include <linux/tick.h> |
39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
40 | #include <linux/perf_counter.h> | ||
40 | 41 | ||
41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
42 | #include <asm/unistd.h> | 43 | #include <asm/unistd.h> |
@@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h) | |||
1167 | { | 1168 | { |
1168 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1169 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
1169 | 1170 | ||
1171 | perf_counter_do_pending(); | ||
1172 | |||
1170 | hrtimer_run_pending(); | 1173 | hrtimer_run_pending(); |
1171 | 1174 | ||
1172 | if (time_after_eq(jiffies, base->timer_jiffies)) | 1175 | if (time_after_eq(jiffies, base->timer_jiffies)) |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/mempolicy.h> | 28 | #include <linux/mempolicy.h> |
29 | #include <linux/rmap.h> | 29 | #include <linux/rmap.h> |
30 | #include <linux/mmu_notifier.h> | 30 | #include <linux/mmu_notifier.h> |
31 | #include <linux/perf_counter.h> | ||
31 | 32 | ||
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
33 | #include <asm/cacheflush.h> | 34 | #include <asm/cacheflush.h> |
@@ -1223,6 +1224,9 @@ munmap_back: | |||
1223 | if (correct_wcount) | 1224 | if (correct_wcount) |
1224 | atomic_inc(&inode->i_writecount); | 1225 | atomic_inc(&inode->i_writecount); |
1225 | out: | 1226 | out: |
1227 | if (vm_flags & VM_EXEC) | ||
1228 | perf_counter_mmap(addr, len, pgoff, file); | ||
1229 | |||
1226 | mm->total_vm += len >> PAGE_SHIFT; | 1230 | mm->total_vm += len >> PAGE_SHIFT; |
1227 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1231 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1228 | if (vm_flags & VM_LOCKED) { | 1232 | if (vm_flags & VM_LOCKED) { |
@@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
1756 | do { | 1760 | do { |
1757 | long nrpages = vma_pages(vma); | 1761 | long nrpages = vma_pages(vma); |
1758 | 1762 | ||
1763 | if (vma->vm_flags & VM_EXEC) { | ||
1764 | perf_counter_munmap(vma->vm_start, | ||
1765 | nrpages << PAGE_SHIFT, | ||
1766 | vma->vm_pgoff, vma->vm_file); | ||
1767 | } | ||
1768 | |||
1759 | mm->total_vm -= nrpages; | 1769 | mm->total_vm -= nrpages; |
1760 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1770 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1761 | vma = remove_vma(vma); | 1771 | vma = remove_vma(vma); |
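Only executable mappings are reported: both hooks are guarded by VM_EXEC, so plain data mappings stay silent while dlopen()ed libraries or JIT code regions generate the mmap/munmap events a profiler needs to resolve samples to symbols. A userspace snippet that would exercise both hooks - whether anything records the events depends on a counter having been opened to listen for them:

        #include <stdio.h>
        #include <sys/mman.h>

        /* An anonymous PROT_EXEC mapping sets VM_EXEC, so with this patch its
         * creation and teardown go through perf_counter_mmap()/munmap(). */
        int main(void)
        {
                size_t len = 4096;
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (p == MAP_FAILED) {
                        perror("mmap");
                        return 1;
                }
                /* ... a JIT would emit code into p here ... */
                munmap(p, len);         /* exercises the VM_EXEC munmap hook */
                return 0;
        }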