-rw-r--r--  Documentation/perf_counter/Makefile | 12
-rw-r--r--  Documentation/perf_counter/design.txt | 283
-rw-r--r--  Documentation/perf_counter/kerneltop.c | 1409
-rw-r--r--  arch/powerpc/include/asm/hw_irq.h | 39
-rw-r--r--  arch/powerpc/include/asm/paca.h | 1
-rw-r--r--  arch/powerpc/include/asm/perf_counter.h | 72
-rw-r--r--  arch/powerpc/include/asm/systbl.h | 1
-rw-r--r--  arch/powerpc/include/asm/unistd.h | 3
-rw-r--r--  arch/powerpc/kernel/Makefile | 2
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 1
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 9
-rw-r--r--  arch/powerpc/kernel/irq.c | 5
-rw-r--r--  arch/powerpc/kernel/perf_counter.c | 846
-rw-r--r--  arch/powerpc/kernel/power4-pmu.c | 557
-rw-r--r--  arch/powerpc/kernel/power5+-pmu.c | 452
-rw-r--r--  arch/powerpc/kernel/power5-pmu.c | 475
-rw-r--r--  arch/powerpc/kernel/power6-pmu.c | 283
-rw-r--r--  arch/powerpc/kernel/ppc970-pmu.c | 375
-rw-r--r--  arch/powerpc/mm/fault.c | 8
-rw-r--r--  arch/powerpc/platforms/Kconfig.cputype | 1
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 3
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 236
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 1
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 3
-rw-r--r--  arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r--  arch/x86/include/asm/perf_counter.h | 100
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 1
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 3
-rw-r--r--  arch/x86/kernel/apic/apic.c | 4
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 12
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 4
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_counter.c | 1213
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r--  arch/x86/kernel/entry_64.S | 7
-rw-r--r--  arch/x86/kernel/irq.c | 10
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 60
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 13
-rw-r--r--  arch/x86/kernel/signal.c | 1
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r--  arch/x86/kernel/traps.c | 15
-rw-r--r--  arch/x86/mm/fault.c | 10
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 7
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 10
-rw-r--r--  drivers/acpi/processor_idle.c | 4
-rw-r--r--  drivers/char/sysrq.c | 2
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  include/linux/init_task.h | 13
-rw-r--r--  include/linux/kernel_stat.h | 6
-rw-r--r--  include/linux/mutex.h | 23
-rw-r--r--  include/linux/perf_counter.h | 591
-rw-r--r--  include/linux/prctl.h | 3
-rw-r--r--  include/linux/sched.h | 13
-rw-r--r--  include/linux/syscalls.h | 5
-rw-r--r--  init/Kconfig | 35
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/exit.c | 13
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/perf_counter.c | 3150
-rw-r--r--  kernel/sched.c | 67
-rw-r--r--  kernel/sys.c | 7
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/timer.c | 3
-rw-r--r--  mm/mmap.c | 10
68 files changed, 10460 insertions, 93 deletions
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
new file mode 100644
index 000000000000..194b66215588
--- /dev/null
+++ b/Documentation/perf_counter/Makefile
@@ -0,0 +1,12 @@
1BINS = kerneltop perfstat
2
3all: $(BINS)
4
5kerneltop: kerneltop.c ../../include/linux/perf_counter.h
6 cc -O6 -Wall -lrt -o $@ $<
7
8perfstat: kerneltop
9 ln -sf kerneltop perfstat
10
11clean:
12 rm $(BINS)
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
new file mode 100644
index 000000000000..aaf105c02fba
--- /dev/null
+++ b/Documentation/perf_counter/design.txt
@@ -0,0 +1,283 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
6CPUs. These registers count certain types of hardware events, such as
7instructions executed, cache misses suffered, or branches mispredicted,
8without slowing down the kernel or applications. These registers can also
9trigger interrupts when a threshold number of events has occurred, and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per-task and per-CPU counters, counter
14groups, and event capabilities on top of those. It also provides
15"virtual" 64-bit counters, regardless of the width of the
16underlying hardware counters.
17
18Performance counters are accessed via special file descriptors.
19There's one file descriptor per virtual counter used.
20
21The special file descriptor is opened via the perf_counter_open()
22system call:
23
24 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
25 pid_t pid, int cpu, int group_fd,
26 unsigned long flags);
27
28The syscall returns the new fd. The fd can be used via the normal
29VFS system calls: read() can be used to read the counter, fcntl()
30can be used to set the blocking mode, etc.
31
32Multiple counters can be kept open at a time, and the counters
33can be poll()ed.
34
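There is no C library wrapper for this system call yet, so programs invoke
it via syscall(2). kerneltop.c (included further below) uses a small wrapper
along these lines; __NR_perf_counter_open is architecture-specific, and the
struct definition comes from include/linux/perf_counter.h in this patch:

    #include <unistd.h>
    #include <sys/syscall.h>

    static int
    sys_perf_counter_open(struct perf_counter_hw_event *hw_event,
                          pid_t pid, int cpu, int group_fd,
                          unsigned long flags)
    {
            return syscall(__NR_perf_counter_open,
                           hw_event, pid, cpu, group_fd, flags);
    }
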
35When creating a new counter fd, 'perf_counter_hw_event' is:
36
37/*
38 * Event to monitor via a performance monitoring counter:
39 */
40struct perf_counter_hw_event {
41 __u64 event_config;
42
43 __u64 irq_period;
44 __u64 record_type;
45 __u64 read_format;
46
47 __u64 disabled : 1, /* off by default */
48 nmi : 1, /* NMI sampling */
49 inherit : 1, /* children inherit it */
50 pinned : 1, /* must always be on PMU */
51 exclusive : 1, /* only group on PMU */
52 exclude_user : 1, /* don't count user */
53 exclude_kernel : 1, /* ditto kernel */
54 exclude_hv : 1, /* ditto hypervisor */
55 exclude_idle : 1, /* don't count when idle */
56
57 __reserved_1 : 55;
58
59 __u32 extra_config_len;
60
61 __u32 __reserved_4;
62 __u64 __reserved_2;
63 __u64 __reserved_3;
64};
65
66The 'event_config' field specifies what the counter should count. It
67is divided into 3 bit-fields:
68
69raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
70type: 7 bits (next most significant) 0x7f00_0000_0000_0000
71event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
72
73If 'raw_type' is 1, then the counter will count a hardware event
74specified by the remaining 63 bits of event_config. The encoding is
75machine-specific.
76
77If 'raw_type' is 0, then the 'type' field says what kind of counter
78this is, with the following encoding:
79
80enum perf_event_types {
81 PERF_TYPE_HARDWARE = 0,
82 PERF_TYPE_SOFTWARE = 1,
83 PERF_TYPE_TRACEPOINT = 2,
84};
85
86A counter of PERF_TYPE_HARDWARE will count the hardware event
87specified by 'event_id':
88
89/*
90 * Generalized performance counter event types, used by the hw_event.event_id
91 * parameter of the sys_perf_counter_open() syscall:
92 */
93enum hw_event_ids {
94 /*
95 * Common hardware events, generalized by the kernel:
96 */
97 PERF_COUNT_CPU_CYCLES = 0,
98 PERF_COUNT_INSTRUCTIONS = 1,
99 PERF_COUNT_CACHE_REFERENCES = 2,
100 PERF_COUNT_CACHE_MISSES = 3,
101 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
102 PERF_COUNT_BRANCH_MISSES = 5,
103 PERF_COUNT_BUS_CYCLES = 6,
104};
105
106These are standardized types of events that work relatively uniformly
107on all CPUs that implement Performance Counters support under Linux,
108although there may be variations (e.g., different CPUs might count
109cache references and misses at different levels of the cache hierarchy).
110If a CPU is not able to count the selected event, then the system call
111will return -EINVAL.
112
113More hw_event_types are supported as well, but they are CPU-specific
114and accessed as raw events. For example, to count "External bus
115cycles while bus lock signal asserted" events on Intel Core CPUs, pass
116in a 0x4064 event_id value and set hw_event.raw_type to 1.
117
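As an illustration, a sketch of composing 'event_config' values by hand.
EID() and PERF_COUNTER_RAW_MASK are the helper and constant that kerneltop.c
(below) builds on top of this patch's header, so treat the exact names as
belonging to this patch rather than to a stable ABI:

    /* generalized event: 'type' in bits 62-56, 'event_id' in bits 55-0: */
    __u64 generic = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES);

    /* raw, machine-specific event: the raw_type (top) bit plus the raw code: */
    __u64 raw     = PERF_COUNTER_RAW_MASK | 0x4064;
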
118A counter of type PERF_TYPE_SOFTWARE will count one of the available
119software events, selected by 'event_id':
120
121/*
122 * Special "software" counters provided by the kernel, even if the hardware
123 * does not support performance counters. These counters measure various
124 * physical and sw events of the kernel (and allow the profiling of them as
125 * well):
126 */
127enum sw_event_ids {
128 PERF_COUNT_CPU_CLOCK = 0,
129 PERF_COUNT_TASK_CLOCK = 1,
130 PERF_COUNT_PAGE_FAULTS = 2,
131 PERF_COUNT_CONTEXT_SWITCHES = 3,
132 PERF_COUNT_CPU_MIGRATIONS = 4,
133 PERF_COUNT_PAGE_FAULTS_MIN = 5,
134 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
135};
136
137Counters come in two flavours: counting counters and sampling
138counters. A "counting" counter is one that is used for counting the
139number of events that occur, and is characterised by having
140irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a
141counting counter simply returns the current value of the counter as
142an 8-byte number.
143
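A minimal counting-counter sketch, using the sys_perf_counter_open() wrapper
shown earlier. Field names follow the structure above; kerneltop.c, for
instance, calls the first field 'config', so check the header in this patch
for the authoritative layout:

    struct perf_counter_hw_event hw_event;
    unsigned long long count;
    int fd;

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS);
    hw_event.irq_period   = 0;                  /* pure counting, no sampling */
    hw_event.record_type  = PERF_RECORD_SIMPLE;

    fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);   /* current task */

    /* ... run the code to be measured ... */

    if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("%llu instructions\n", count);
    close(fd);
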
144A "sampling" counter is one that is set up to generate an interrupt
145every N events, where N is given by 'irq_period'. A sampling counter
146has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The
147record_type controls what data is recorded on each interrupt, and the
148available values are currently:
149
150/*
151 * IRQ-notification data record type:
152 */
153enum perf_counter_record_type {
154 PERF_RECORD_SIMPLE = 0,
155 PERF_RECORD_IRQ = 1,
156 PERF_RECORD_GROUP = 2,
157};
158
159A record_type value of PERF_RECORD_IRQ will record the instruction
160pointer (IP) at which the interrupt occurred. A record_type value of
161PERF_RECORD_GROUP will record the event_config and counter value of
162all of the other counters in the group, and should only be used on a
163group leader (see below). Currently these two values are mutually
164exclusive, but record_type will become a bit-mask in future and
165support other values.
166
167A sampling counter has an event queue, into which an event is placed
168on each interrupt. A read() on a sampling counter will read the next
169event from the event queue. If the queue is empty, the read() will
170either block or return an EAGAIN error, depending on whether the fd
171has been set to non-blocking mode or not.
172
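A sampling-counter sketch along the same lines (a sketch only; the exact
record layout returned by read() is defined by the header in this patch):
the fd is switched to non-blocking mode and poll()ed, and each successful
read() drains records from the event queue:

    struct perf_counter_hw_event hw_event;
    struct pollfd pfd;
    char buf[4096];
    ssize_t n;
    int fd;

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES);
    hw_event.irq_period   = 100000;             /* one sample per 100,000 cycles */
    hw_event.record_type  = PERF_RECORD_IRQ;    /* record the interrupted IP */

    fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
    fcntl(fd, F_SETFL, O_NONBLOCK);

    pfd.fd     = fd;
    pfd.events = POLLIN;

    for (;;) {
            n = read(fd, buf, sizeof(buf));
            if (n > 0)
                    continue;                   /* decode queued records here */
            if (n < 0 && errno == EAGAIN)
                    poll(&pfd, 1, -1);          /* wait for the next sample */
            else
                    break;                      /* EOF or unexpected error */
    }
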
173The 'disabled' bit specifies whether the counter starts out disabled
174or enabled. If it is initially disabled, it can be enabled by ioctl
175or prctl (see below).
176
177The 'nmi' bit specifies, for hardware events, whether the counter
178should be set up to request non-maskable interrupts (NMIs) or normal
179interrupts. This bit is ignored if the user doesn't have
180CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
181generate NMIs from hardware counters.
182
183The 'inherit' bit, if set, specifies that this counter should count
184events on descendant tasks as well as the task specified. This only
185applies to new descendants, not to any existing descendants at the
186time the counter is created (nor to any new descendants of existing
187descendants).
188
189The 'pinned' bit, if set, specifies that the counter should always be
190on the CPU if at all possible. It only applies to hardware counters
191and only to group leaders. If a pinned counter cannot be put onto the
192CPU (e.g. because there are not enough hardware counters or because of
193a conflict with some other event), then the counter goes into an
194'error' state, where reads return end-of-file (i.e. read() returns 0)
195until the counter is subsequently enabled or disabled.
196
197The 'exclusive' bit, if set, specifies that when this counter's group
198is on the CPU, it should be the only group using the CPU's counters.
199In future, this will allow sophisticated monitoring programs to supply
200extra configuration information via 'extra_config_len' to exploit
201advanced features of the CPU's Performance Monitor Unit (PMU) that are
202not otherwise accessible and that might disrupt other hardware
203counters.
204
205The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
206way to request that counting of events be restricted to times when the
207CPU is in user, kernel and/or hypervisor mode.
208
209
210The 'pid' parameter to the perf_counter_open() system call allows the
211counter to be specific to a task:
212
213 pid == 0: if the pid parameter is zero, the counter is attached to the
214 current task.
215
216 pid > 0: the counter is attached to a specific task (if the current task
217 has sufficient privilege to do so)
218
219 pid < 0: all tasks are counted (per cpu counters)
220
221The 'cpu' parameter allows a counter to be made specific to a CPU:
222
223 cpu >= 0: the counter is restricted to a specific CPU
224 cpu == -1: the counter counts on all CPUs
225
226(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
227
228A 'pid > 0' and 'cpu == -1' counter is a per-task counter that counts
229events of that task and 'follows' that task to whatever CPU the task
230gets scheduled to. Per-task counters can be created by any user, for
231their own tasks.
232
233A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
234all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
235
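For example, a per-CPU counter for CPU 2 (requires CAP_SYS_ADMIN; this is a
sketch of what kerneltop.c does in its system-wide mode):

    /* count all context switches on CPU 2, regardless of task: */
    hw_event.event_config = EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES);
    fd = sys_perf_counter_open(&hw_event, -1 /* pid */, 2 /* cpu */, -1, 0);
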
236The 'flags' parameter is currently unused and must be zero.
237
238The 'group_fd' parameter allows counter "groups" to be set up. A
239counter group has one counter which is the group "leader". The leader
240is created first, with group_fd = -1 in the perf_counter_open call
241that creates it. The rest of the group members are created
242subsequently, with group_fd giving the fd of the group leader.
243(A single counter on its own is created with group_fd = -1 and is
244considered to be a group with only 1 member.)
245
246A counter group is scheduled onto the CPU as a unit, that is, it will
247only be put onto the CPU if all of the counters in the group can be
248put onto the CPU. This means that the values of the member counters
249can be meaningfully compared, added, divided (to get ratios), etc.,
250with each other, since they have counted events for the same set of
251executed instructions.
252
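A sketch of a two-counter group measuring instructions per cycle for the
current task; because the group is scheduled as a unit, the two values cover
the same stretch of executed instructions (names as in the earlier sketches):

    struct perf_counter_hw_event hw_event;
    unsigned long long cycles, instructions;
    int leader, member;

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES);
    leader = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);      /* group leader */

    hw_event.event_config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS);
    member = sys_perf_counter_open(&hw_event, 0, -1, leader, 0);  /* same group */

    /* ... run the code to be measured ... */

    read(leader, &cycles, sizeof(cycles));
    read(member, &instructions, sizeof(instructions));
    printf("IPC: %.2f\n", (double)instructions / (double)cycles);
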
253Counters can be enabled and disabled in two ways: via ioctl and via
254prctl. When a counter is disabled, it doesn't count or generate
255events but does continue to exist and maintain its count value.
256
257An individual counter or counter group can be enabled with
258
259 ioctl(fd, PERF_COUNTER_IOC_ENABLE);
260
261or disabled with
262
263 ioctl(fd, PERF_COUNTER_IOC_DISABLE);
264
265Enabling or disabling the leader of a group enables or disables the
266whole group; that is, while the group leader is disabled, none of the
267counters in the group will count. Enabling or disabling a member of a
268group other than the leader only affects that counter - disabling a
269non-leader stops that counter from counting but doesn't affect any
270other counter.
271
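For example, a counter created with the 'disabled' bit set can be enabled
only around a region of interest (a sketch, continuing the fragments above):

    hw_event.disabled = 1;                      /* start out disabled */
    fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

    ioctl(fd, PERF_COUNTER_IOC_ENABLE);
    /* ... region of interest ... */
    ioctl(fd, PERF_COUNTER_IOC_DISABLE);
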
272A process can enable or disable all the counter groups that are
273attached to it, using prctl:
274
275 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
276
277 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
278
279This applies to all counters on the current process, whether created
280by this process or by another, and doesn't affect any counters that
281this process has created on other processes. It only enables or
282disables the group leaders, not any other members in the groups.
283
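This is how perfstat (see kerneltop.c below) brackets the measured command:
it creates its counters with 'disabled' and 'inherit' set, enables them just
before forking, and disables them once the child has exited (fragment; pid,
status and argv are assumed to be declared by the caller):

    prctl(PR_TASK_PERF_COUNTERS_ENABLE);

    if ((pid = fork()) == 0) {
            execvp(argv[0], argv);      /* counters are inherited by the child */
            exit(-1);
    }
    while (wait(&status) >= 0)
            ;

    prctl(PR_TASK_PERF_COUNTERS_DISABLE);
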
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
new file mode 100644
index 000000000000..15f3a5f90198
--- /dev/null
+++ b/Documentation/perf_counter/kerneltop.c
@@ -0,0 +1,1409 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -lrt -o kerneltop kerneltop.c
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#define _GNU_SOURCE
65#include <sys/types.h>
66#include <sys/stat.h>
67#include <sys/time.h>
68#include <unistd.h>
69#include <stdint.h>
70#include <stdlib.h>
71#include <string.h>
72#include <limits.h>
73#include <getopt.h>
74#include <assert.h>
75#include <fcntl.h>
76#include <stdio.h>
77#include <errno.h>
78#include <ctype.h>
79#include <time.h>
80#include <sched.h>
81#include <pthread.h>
82
83#include <sys/syscall.h>
84#include <sys/ioctl.h>
85#include <sys/poll.h>
86#include <sys/prctl.h>
87#include <sys/wait.h>
88#include <sys/uio.h>
89#include <sys/mman.h>
90
91#include <linux/unistd.h>
92#include <linux/types.h>
93
94#include "../../include/linux/perf_counter.h"
95
96
97/*
98 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
99 * counters in the current task.
100 */
101#define PR_TASK_PERF_COUNTERS_DISABLE 31
102#define PR_TASK_PERF_COUNTERS_ENABLE 32
103
104#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
105
106#define rdclock() \
107({ \
108 struct timespec ts; \
109 \
110 clock_gettime(CLOCK_MONOTONIC, &ts); \
111 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
112})
113
114/*
115 * Pick up some kernel type conventions:
116 */
117#define __user
118#define asmlinkage
119
120#ifdef __x86_64__
121#define __NR_perf_counter_open 295
122#define rmb() asm volatile("lfence" ::: "memory")
123#define cpu_relax() asm volatile("rep; nop" ::: "memory");
124#endif
125
126#ifdef __i386__
127#define __NR_perf_counter_open 333
128#define rmb() asm volatile("lfence" ::: "memory")
129#define cpu_relax() asm volatile("rep; nop" ::: "memory");
130#endif
131
132#ifdef __powerpc__
133#define __NR_perf_counter_open 319
134#define rmb() asm volatile ("sync" ::: "memory")
135#define cpu_relax() asm volatile ("" ::: "memory");
136#endif
137
138#define unlikely(x) __builtin_expect(!!(x), 0)
139#define min(x, y) ({ \
140 typeof(x) _min1 = (x); \
141 typeof(y) _min2 = (y); \
142 (void) (&_min1 == &_min2); \
143 _min1 < _min2 ? _min1 : _min2; })
144
145asmlinkage int sys_perf_counter_open(
146 struct perf_counter_hw_event *hw_event_uptr __user,
147 pid_t pid,
148 int cpu,
149 int group_fd,
150 unsigned long flags)
151{
152 return syscall(
153 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
154}
155
156#define MAX_COUNTERS 64
157#define MAX_NR_CPUS 256
158
159#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
160
161static int run_perfstat = 0;
162static int system_wide = 0;
163
164static int nr_counters = 0;
165static __u64 event_id[MAX_COUNTERS] = {
166 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
167 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
168 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
169 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
170
171 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
172 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
173 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
174 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
175};
176static int default_interval = 100000;
177static int event_count[MAX_COUNTERS];
178static int fd[MAX_NR_CPUS][MAX_COUNTERS];
179
180static __u64 count_filter = 100;
181
182static int tid = -1;
183static int profile_cpu = -1;
184static int nr_cpus = 0;
185static int nmi = 1;
186static unsigned int realtime_prio = 0;
187static int group = 0;
188static unsigned int page_size;
189static unsigned int mmap_pages = 16;
190static int use_mmap = 0;
191static int use_munmap = 0;
192
193static char *vmlinux;
194
195static char *sym_filter;
196static unsigned long filter_start;
197static unsigned long filter_end;
198
199static int delay_secs = 2;
200static int zero;
201static int dump_symtab;
202
203static int scale;
204
205struct source_line {
206 uint64_t EIP;
207 unsigned long count;
208 char *line;
209 struct source_line *next;
210};
211
212static struct source_line *lines;
213static struct source_line **lines_tail;
214
215const unsigned int default_count[] = {
216 1000000,
217 1000000,
218 10000,
219 10000,
220 1000000,
221 10000,
222};
223
224static char *hw_event_names[] = {
225 "CPU cycles",
226 "instructions",
227 "cache references",
228 "cache misses",
229 "branches",
230 "branch misses",
231 "bus cycles",
232};
233
234static char *sw_event_names[] = {
235 "cpu clock ticks",
236 "task clock ticks",
237 "pagefaults",
238 "context switches",
239 "CPU migrations",
240 "minor faults",
241 "major faults",
242};
243
244struct event_symbol {
245 __u64 event;
246 char *symbol;
247};
248
249static struct event_symbol event_symbols[] = {
250 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
251 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
252 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
253 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
254 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
255 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
256 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
257 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
258 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
259
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
261 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
263 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
264 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
265 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
266 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
267 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
268 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
269 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
270};
271
272#define __PERF_COUNTER_FIELD(config, name) \
273 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
274
275#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
276#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
277#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
278#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
279
280static void display_events_help(void)
281{
282 unsigned int i;
283 __u64 e;
284
285 printf(
286 " -e EVENT --event=EVENT # symbolic-name abbreviations");
287
288 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
289 int type, id;
290
291 e = event_symbols[i].event;
292 type = PERF_COUNTER_TYPE(e);
293 id = PERF_COUNTER_ID(e);
294
295 printf("\n %d:%d: %-20s",
296 type, id, event_symbols[i].symbol);
297 }
298
299 printf("\n"
300 " rNNN: raw PMU events (eventsel+umask)\n\n");
301}
302
303static void display_perfstat_help(void)
304{
305 printf(
306 "Usage: perfstat [<events...>] <cmd...>\n\n"
307 "PerfStat Options (up to %d event types can be specified):\n\n",
308 MAX_COUNTERS);
309
310 display_events_help();
311
312 printf(
313 " -l # scale counter values\n"
314 " -a # system-wide collection\n");
315 exit(0);
316}
317
318static void display_help(void)
319{
320 if (run_perfstat)
321 return display_perfstat_help();
322
323 printf(
324 "Usage: kerneltop [<options>]\n"
325 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
326 "KernelTop Options (up to %d event types can be specified at once):\n\n",
327 MAX_COUNTERS);
328
329 display_events_help();
330
331 printf(
332 " -S --stat # perfstat COMMAND\n"
333 " -a # system-wide collection (for perfstat)\n\n"
334 " -c CNT --count=CNT # event period to sample\n\n"
335 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
336 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
337 " -l # show scale factor for RR events\n"
338 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
339 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
340 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
341 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
342 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
343 " -z --zero # zero counts after display\n"
344 " -D --dump_symtab # dump symbol table to stderr on startup\n"
345 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
346 " -M --mmap_info # print mmap info stream\n"
347 " -U --munmap_info # print munmap info stream\n"
348 );
349
350 exit(0);
351}
352
353static char *event_name(int ctr)
354{
355 __u64 config = event_id[ctr];
356 int type = PERF_COUNTER_TYPE(config);
357 int id = PERF_COUNTER_ID(config);
358 static char buf[32];
359
360 if (PERF_COUNTER_RAW(config)) {
361 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
362 return buf;
363 }
364
365 switch (type) {
366 case PERF_TYPE_HARDWARE:
367 if (id < PERF_HW_EVENTS_MAX)
368 return hw_event_names[id];
369 return "unknown-hardware";
370
371 case PERF_TYPE_SOFTWARE:
372 if (id < PERF_SW_EVENTS_MAX)
373 return sw_event_names[id];
374 return "unknown-software";
375
376 default:
377 break;
378 }
379
380 return "unknown";
381}
382
383/*
384 * Each event can have multiple symbolic names.
385 * Symbolic names are (almost) exactly matched.
386 */
387static __u64 match_event_symbols(char *str)
388{
389 __u64 config, id;
390 int type;
391 unsigned int i;
392
393 if (sscanf(str, "r%llx", &config) == 1)
394 return config | PERF_COUNTER_RAW_MASK;
395
396 if (sscanf(str, "%d:%llu", &type, &id) == 2)
397 return EID(type, id);
398
399 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
400 if (!strncmp(str, event_symbols[i].symbol,
401 strlen(event_symbols[i].symbol)))
402 return event_symbols[i].event;
403 }
404
405 return ~0ULL;
406}
407
408static int parse_events(char *str)
409{
410 __u64 config;
411
412again:
413 if (nr_counters == MAX_COUNTERS)
414 return -1;
415
416 config = match_event_symbols(str);
417 if (config == ~0ULL)
418 return -1;
419
420 event_id[nr_counters] = config;
421 nr_counters++;
422
423 str = strstr(str, ",");
424 if (str) {
425 str++;
426 goto again;
427 }
428
429 return 0;
430}
431
432
433/*
434 * perfstat
435 */
436
437char fault_here[1000000];
438
439static void create_perfstat_counter(int counter)
440{
441 struct perf_counter_hw_event hw_event;
442
443 memset(&hw_event, 0, sizeof(hw_event));
444 hw_event.config = event_id[counter];
445 hw_event.record_type = 0;
446 hw_event.nmi = 0;
447 if (scale)
448 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
449 PERF_FORMAT_TOTAL_TIME_RUNNING;
450
451 if (system_wide) {
452 int cpu;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
455 if (fd[cpu][counter] < 0) {
456 printf("perfstat error: syscall returned with %d (%s)\n",
457 fd[cpu][counter], strerror(errno));
458 exit(-1);
459 }
460 }
461 } else {
462 hw_event.inherit = 1;
463 hw_event.disabled = 1;
464
465 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
466 if (fd[0][counter] < 0) {
467 printf("perfstat error: syscall returned with %d (%s)\n",
468 fd[0][counter], strerror(errno));
469 exit(-1);
470 }
471 }
472}
473
474int do_perfstat(int argc, char *argv[])
475{
476 unsigned long long t0, t1;
477 int counter;
478 ssize_t res;
479 int status;
480 int pid;
481
482 if (!system_wide)
483 nr_cpus = 1;
484
485 for (counter = 0; counter < nr_counters; counter++)
486 create_perfstat_counter(counter);
487
488 argc -= optind;
489 argv += optind;
490
491 if (!argc)
492 display_help();
493
494 /*
495 * Enable counters and exec the command:
496 */
497 t0 = rdclock();
498 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
499
500 if ((pid = fork()) < 0)
501 perror("failed to fork");
502 if (!pid) {
503 if (execvp(argv[0], argv)) {
504 perror(argv[0]);
505 exit(-1);
506 }
507 }
508 while (wait(&status) >= 0)
509 ;
510 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
511 t1 = rdclock();
512
513 fflush(stdout);
514
515 fprintf(stderr, "\n");
516 fprintf(stderr, " Performance counter stats for \'%s\':\n",
517 argv[0]);
518 fprintf(stderr, "\n");
519
520 for (counter = 0; counter < nr_counters; counter++) {
521 int cpu, nv;
522 __u64 count[3], single_count[3];
523 int scaled;
524
525 count[0] = count[1] = count[2] = 0;
526 nv = scale ? 3 : 1;
527 for (cpu = 0; cpu < nr_cpus; cpu ++) {
528 res = read(fd[cpu][counter],
529 single_count, nv * sizeof(__u64));
530 assert(res == nv * sizeof(__u64));
531
532 count[0] += single_count[0];
533 if (scale) {
534 count[1] += single_count[1];
535 count[2] += single_count[2];
536 }
537 }
538
539 scaled = 0;
540 if (scale) {
541 if (count[2] == 0) {
542 fprintf(stderr, " %14s %-20s\n",
543 "<not counted>", event_name(counter));
544 continue;
545 }
546 if (count[2] < count[1]) {
547 scaled = 1;
548 count[0] = (unsigned long long)
549 ((double)count[0] * count[1] / count[2] + 0.5);
550 }
551 }
552
553 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
554 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
555
556 double msecs = (double)count[0] / 1000000;
557
558 fprintf(stderr, " %14.6f %-20s (msecs)",
559 msecs, event_name(counter));
560 } else {
561 fprintf(stderr, " %14Ld %-20s (events)",
562 count[0], event_name(counter));
563 }
564 if (scaled)
565 fprintf(stderr, " (scaled from %.2f%%)",
566 (double) count[2] / count[1] * 100);
567 fprintf(stderr, "\n");
568 }
569 fprintf(stderr, "\n");
570 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
571 (double)(t1-t0)/1e6);
572 fprintf(stderr, "\n");
573
574 return 0;
575}
576
577/*
578 * Symbols
579 */
580
581static uint64_t min_ip;
582static uint64_t max_ip = -1ll;
583
584struct sym_entry {
585 unsigned long long addr;
586 char *sym;
587 unsigned long count[MAX_COUNTERS];
588 int skip;
589 struct source_line *source;
590};
591
592#define MAX_SYMS 100000
593
594static int sym_table_count;
595
596struct sym_entry *sym_filter_entry;
597
598static struct sym_entry sym_table[MAX_SYMS];
599
600static void show_details(struct sym_entry *sym);
601
602/*
603 * Ordering weight: count-1 * count-2 * ... / count-n
604 */
605static double sym_weight(const struct sym_entry *sym)
606{
607 double weight;
608 int counter;
609
610 weight = sym->count[0];
611
612 for (counter = 1; counter < nr_counters-1; counter++)
613 weight *= sym->count[counter];
614
615 weight /= (sym->count[counter] + 1);
616
617 return weight;
618}
619
620static int compare(const void *__sym1, const void *__sym2)
621{
622 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
623
624 return sym_weight(sym1) < sym_weight(sym2);
625}
626
627static long events;
628static long userspace_events;
629static const char CONSOLE_CLEAR[] = "\033[H\033[2J";
630
631static struct sym_entry tmp[MAX_SYMS];
632
633static void print_sym_table(void)
634{
635 int i, printed;
636 int counter;
637 float events_per_sec = events/delay_secs;
638 float kevents_per_sec = (events-userspace_events)/delay_secs;
639 float sum_kevents = 0.0;
640
641 events = userspace_events = 0;
642 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
643 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
644
645 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
646 sum_kevents += tmp[i].count[0];
647
648 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
649
650 printf(
651"------------------------------------------------------------------------------\n");
652 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
653 events_per_sec,
654 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
655 nmi ? "NMI" : "IRQ");
656
657 if (nr_counters == 1)
658 printf("%d ", event_count[0]);
659
660 for (counter = 0; counter < nr_counters; counter++) {
661 if (counter)
662 printf("/");
663
664 printf("%s", event_name(counter));
665 }
666
667 printf( "], ");
668
669 if (tid != -1)
670 printf(" (tid: %d", tid);
671 else
672 printf(" (all");
673
674 if (profile_cpu != -1)
675 printf(", cpu: %d)\n", profile_cpu);
676 else {
677 if (tid != -1)
678 printf(")\n");
679 else
680 printf(", %d CPUs)\n", nr_cpus);
681 }
682
683 printf("------------------------------------------------------------------------------\n\n");
684
685 if (nr_counters == 1)
686 printf(" events pcnt");
687 else
688 printf(" weight events pcnt");
689
690 printf(" RIP kernel function\n"
691 " ______ ______ _____ ________________ _______________\n\n"
692 );
693
694 for (i = 0, printed = 0; i < sym_table_count; i++) {
695 float pcnt;
696 int count;
697
698 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
699 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
700
701 if (nr_counters == 1)
702 printf("%19.2f - %4.1f%% - %016llx : %s\n",
703 sym_weight(tmp + i),
704 pcnt, tmp[i].addr, tmp[i].sym);
705 else
706 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
707 sym_weight(tmp + i),
708 tmp[i].count[0],
709 pcnt, tmp[i].addr, tmp[i].sym);
710 printed++;
711 }
712 /*
713 * Add decay to the counts:
714 */
715 for (count = 0; count < nr_counters; count++)
716 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
717 }
718
719 if (sym_filter_entry)
720 show_details(sym_filter_entry);
721
722 {
723 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
724
725 if (poll(&stdin_poll, 1, 0) == 1) {
726 printf("key pressed - exiting.\n");
727 exit(0);
728 }
729 }
730}
731
732static void *display_thread(void *arg)
733{
734 printf("KernelTop refresh period: %d seconds\n", delay_secs);
735
736 while (!sleep(delay_secs))
737 print_sym_table();
738
739 return NULL;
740}
741
742static int read_symbol(FILE *in, struct sym_entry *s)
743{
744 static int filter_match = 0;
745 char *sym, stype;
746 char str[500];
747 int rc, pos;
748
749 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
750 if (rc == EOF)
751 return -1;
752
753 assert(rc == 3);
754
755 /* skip until end of line: */
756 pos = strlen(str);
757 do {
758 rc = fgetc(in);
759 if (rc == '\n' || rc == EOF || pos >= 499)
760 break;
761 str[pos] = rc;
762 pos++;
763 } while (1);
764 str[pos] = 0;
765
766 sym = str;
767
768 /* Filter out known duplicates and non-text symbols. */
769 if (!strcmp(sym, "_text"))
770 return 1;
771 if (!min_ip && !strcmp(sym, "_stext"))
772 return 1;
773 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
774 return 1;
775 if (stype != 'T' && stype != 't')
776 return 1;
777 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
778 return 1;
779 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
780 return 1;
781
782 s->sym = malloc(strlen(str));
783 assert(s->sym);
784
785 strcpy((char *)s->sym, str);
786 s->skip = 0;
787
788 /* Tag events to be skipped. */
789 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
790 s->skip = 1;
791 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
792 s->skip = 1;
793 else if (!strcmp("mwait_idle", s->sym))
794 s->skip = 1;
795
796 if (filter_match == 1) {
797 filter_end = s->addr;
798 filter_match = -1;
799 if (filter_end - filter_start > 10000) {
800 printf("hm, too large filter symbol <%s> - skipping.\n",
801 sym_filter);
802 printf("symbol filter start: %016lx\n", filter_start);
803 printf(" end: %016lx\n", filter_end);
804 filter_end = filter_start = 0;
805 sym_filter = NULL;
806 sleep(1);
807 }
808 }
809 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
810 filter_match = 1;
811 filter_start = s->addr;
812 }
813
814 return 0;
815}
816
817int compare_addr(const void *__sym1, const void *__sym2)
818{
819 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
820
821 return sym1->addr > sym2->addr;
822}
823
824static void sort_symbol_table(void)
825{
826 int i, dups;
827
828 do {
829 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
830 for (i = 0, dups = 0; i < sym_table_count; i++) {
831 if (sym_table[i].addr == sym_table[i+1].addr) {
832 sym_table[i+1].addr = -1ll;
833 dups++;
834 }
835 }
836 sym_table_count -= dups;
837 } while(dups);
838}
839
840static void parse_symbols(void)
841{
842 struct sym_entry *last;
843
844 FILE *kallsyms = fopen("/proc/kallsyms", "r");
845
846 if (!kallsyms) {
847 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
848 exit(-1);
849 }
850
851 while (!feof(kallsyms)) {
852 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
853 sym_table_count++;
854 assert(sym_table_count <= MAX_SYMS);
855 }
856 }
857
858 sort_symbol_table();
859 min_ip = sym_table[0].addr;
860 max_ip = sym_table[sym_table_count-1].addr;
861 last = sym_table + sym_table_count++;
862
863 last->addr = -1ll;
864 last->sym = "<end>";
865
866 if (filter_end) {
867 int count;
868 for (count=0; count < sym_table_count; count ++) {
869 if (!strcmp(sym_table[count].sym, sym_filter)) {
870 sym_filter_entry = &sym_table[count];
871 break;
872 }
873 }
874 }
875 if (dump_symtab) {
876 int i;
877
878 for (i = 0; i < sym_table_count; i++)
879 fprintf(stderr, "%llx %s\n",
880 sym_table[i].addr, sym_table[i].sym);
881 }
882}
883
884/*
885 * Source lines
886 */
887
888static void parse_vmlinux(char *filename)
889{
890 FILE *file;
891 char command[PATH_MAX*2];
892 if (!filename)
893 return;
894
895 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
896
897 file = popen(command, "r");
898 if (!file)
899 return;
900
901 lines_tail = &lines;
902 while (!feof(file)) {
903 struct source_line *src;
904 size_t dummy = 0;
905 char *c;
906
907 src = malloc(sizeof(struct source_line));
908 assert(src != NULL);
909 memset(src, 0, sizeof(struct source_line));
910
911 if (getline(&src->line, &dummy, file) < 0)
912 break;
913 if (!src->line)
914 break;
915
916 c = strchr(src->line, '\n');
917 if (c)
918 *c = 0;
919
920 src->next = NULL;
921 *lines_tail = src;
922 lines_tail = &src->next;
923
924 if (strlen(src->line)>8 && src->line[8] == ':')
925 src->EIP = strtoull(src->line, NULL, 16);
926 if (strlen(src->line)>8 && src->line[16] == ':')
927 src->EIP = strtoull(src->line, NULL, 16);
928 }
929 pclose(file);
930}
931
932static void record_precise_ip(uint64_t ip)
933{
934 struct source_line *line;
935
936 for (line = lines; line; line = line->next) {
937 if (line->EIP == ip)
938 line->count++;
939 if (line->EIP > ip)
940 break;
941 }
942}
943
944static void lookup_sym_in_vmlinux(struct sym_entry *sym)
945{
946 struct source_line *line;
947 char pattern[PATH_MAX];
948 sprintf(pattern, "<%s>:", sym->sym);
949
950 for (line = lines; line; line = line->next) {
951 if (strstr(line->line, pattern)) {
952 sym->source = line;
953 break;
954 }
955 }
956}
957
958static void show_lines(struct source_line *line_queue, int line_queue_count)
959{
960 int i;
961 struct source_line *line;
962
963 line = line_queue;
964 for (i = 0; i < line_queue_count; i++) {
965 printf("%8li\t%s\n", line->count, line->line);
966 line = line->next;
967 }
968}
969
970#define TRACE_COUNT 3
971
972static void show_details(struct sym_entry *sym)
973{
974 struct source_line *line;
975 struct source_line *line_queue = NULL;
976 int displayed = 0;
977 int line_queue_count = 0;
978
979 if (!sym->source)
980 lookup_sym_in_vmlinux(sym);
981 if (!sym->source)
982 return;
983
984 printf("Showing details for %s\n", sym->sym);
985
986 line = sym->source;
987 while (line) {
988 if (displayed && strstr(line->line, ">:"))
989 break;
990
991 if (!line_queue_count)
992 line_queue = line;
993 line_queue_count ++;
994
995 if (line->count >= count_filter) {
996 show_lines(line_queue, line_queue_count);
997 line_queue_count = 0;
998 line_queue = NULL;
999 } else if (line_queue_count > TRACE_COUNT) {
1000 line_queue = line_queue->next;
1001 line_queue_count --;
1002 }
1003
1004 line->count = 0;
1005 displayed++;
1006 if (displayed > 300)
1007 break;
1008 line = line->next;
1009 }
1010}
1011
1012/*
1013 * Binary search in the histogram table and record the hit:
1014 */
1015static void record_ip(uint64_t ip, int counter)
1016{
1017 int left_idx, middle_idx, right_idx, idx;
1018 unsigned long left, middle, right;
1019
1020 record_precise_ip(ip);
1021
1022 left_idx = 0;
1023 right_idx = sym_table_count-1;
1024 assert(ip <= max_ip && ip >= min_ip);
1025
1026 while (left_idx + 1 < right_idx) {
1027 middle_idx = (left_idx + right_idx) / 2;
1028
1029 left = sym_table[ left_idx].addr;
1030 middle = sym_table[middle_idx].addr;
1031 right = sym_table[ right_idx].addr;
1032
1033 if (!(left <= middle && middle <= right)) {
1034 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1035 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1036 }
1037 assert(left <= middle && middle <= right);
1038 if (!(left <= ip && ip <= right)) {
1039 printf(" left: %016lx\n", left);
1040 printf(" ip: %016lx\n", (unsigned long)ip);
1041 printf("right: %016lx\n", right);
1042 }
1043 assert(left <= ip && ip <= right);
1044 /*
1045 * [ left .... target .... middle .... right ]
1046 * => right := middle
1047 */
1048 if (ip < middle) {
1049 right_idx = middle_idx;
1050 continue;
1051 }
1052 /*
1053 * [ left .... middle ... target ... right ]
1054 * => left := middle
1055 */
1056 left_idx = middle_idx;
1057 }
1058
1059 idx = left_idx;
1060
1061 if (!sym_table[idx].skip)
1062 sym_table[idx].count[counter]++;
1063 else events--;
1064}
1065
1066static void process_event(uint64_t ip, int counter)
1067{
1068 events++;
1069
1070 if (ip < min_ip || ip > max_ip) {
1071 userspace_events++;
1072 return;
1073 }
1074
1075 record_ip(ip, counter);
1076}
1077
1078static void process_options(int argc, char *argv[])
1079{
1080 int error = 0, counter;
1081
1082 if (strstr(argv[0], "perfstat"))
1083 run_perfstat = 1;
1084
1085 for (;;) {
1086 int option_index = 0;
1087 /** Options for getopt */
1088 static struct option long_options[] = {
1089 {"count", required_argument, NULL, 'c'},
1090 {"cpu", required_argument, NULL, 'C'},
1091 {"delay", required_argument, NULL, 'd'},
1092 {"dump_symtab", no_argument, NULL, 'D'},
1093 {"event", required_argument, NULL, 'e'},
1094 {"filter", required_argument, NULL, 'f'},
1095 {"group", required_argument, NULL, 'g'},
1096 {"help", no_argument, NULL, 'h'},
1097 {"nmi", required_argument, NULL, 'n'},
1098 {"mmap_info", no_argument, NULL, 'M'},
1099 {"mmap_pages", required_argument, NULL, 'm'},
1100 {"munmap_info", no_argument, NULL, 'U'},
1101 {"pid", required_argument, NULL, 'p'},
1102 {"realtime", required_argument, NULL, 'r'},
1103 {"scale", no_argument, NULL, 'l'},
1104 {"symbol", required_argument, NULL, 's'},
1105 {"stat", no_argument, NULL, 'S'},
1106 {"vmlinux", required_argument, NULL, 'x'},
1107 {"zero", no_argument, NULL, 'z'},
1108 {NULL, 0, NULL, 0 }
1109 };
1110 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
1111 long_options, &option_index);
1112 if (c == -1)
1113 break;
1114
1115 switch (c) {
1116 case 'a': system_wide = 1; break;
1117 case 'c': default_interval = atoi(optarg); break;
1118 case 'C':
1119 /* CPU and PID are mutually exclusive */
1120 if (tid != -1) {
1121 printf("WARNING: CPU switch overriding PID\n");
1122 sleep(1);
1123 tid = -1;
1124 }
1125 profile_cpu = atoi(optarg); break;
1126 case 'd': delay_secs = atoi(optarg); break;
1127 case 'D': dump_symtab = 1; break;
1128
1129 case 'e': error = parse_events(optarg); break;
1130
1131 case 'f': count_filter = atoi(optarg); break;
1132 case 'g': group = atoi(optarg); break;
1133 case 'h': display_help(); break;
1134 case 'l': scale = 1; break;
1135 case 'n': nmi = atoi(optarg); break;
1136 case 'p':
1137 /* CPU and PID are mutually exclusive */
1138 if (profile_cpu != -1) {
1139 printf("WARNING: PID switch overriding CPU\n");
1140 sleep(1);
1141 profile_cpu = -1;
1142 }
1143 tid = atoi(optarg); break;
1144 case 'r': realtime_prio = atoi(optarg); break;
1145 case 's': sym_filter = strdup(optarg); break;
1146 case 'S': run_perfstat = 1; break;
1147 case 'x': vmlinux = strdup(optarg); break;
1148 case 'z': zero = 1; break;
1149 case 'm': mmap_pages = atoi(optarg); break;
1150 case 'M': use_mmap = 1; break;
1151 case 'U': use_munmap = 1; break;
1152 default: error = 1; break;
1153 }
1154 }
1155 if (error)
1156 display_help();
1157
1158 if (!nr_counters) {
1159 if (run_perfstat)
1160 nr_counters = 8;
1161 else {
1162 nr_counters = 1;
1163 event_id[0] = 0;
1164 }
1165 }
1166
1167 for (counter = 0; counter < nr_counters; counter++) {
1168 if (event_count[counter])
1169 continue;
1170
1171 event_count[counter] = default_interval;
1172 }
1173}
1174
1175struct mmap_data {
1176 int counter;
1177 void *base;
1178 unsigned int mask;
1179 unsigned int prev;
1180};
1181
1182static unsigned int mmap_read_head(struct mmap_data *md)
1183{
1184 struct perf_counter_mmap_page *pc = md->base;
1185 int head;
1186
1187 head = pc->data_head;
1188 rmb();
1189
1190 return head;
1191}
1192
1193struct timeval last_read, this_read;
1194
1195static void mmap_read(struct mmap_data *md)
1196{
1197 unsigned int head = mmap_read_head(md);
1198 unsigned int old = md->prev;
1199 unsigned char *data = md->base + page_size;
1200 int diff;
1201
1202 gettimeofday(&this_read, NULL);
1203
1204 /*
1205 * If we're further behind than half the buffer, there's a chance
1206 * the writer will bite our tail and screw up the events under us.
1207 *
1208 * If we somehow ended up ahead of the head, we got messed up.
1209 *
1210 * In either case, truncate and restart at head.
1211 */
1212 diff = head - old;
1213 if (diff > md->mask / 2 || diff < 0) {
1214 struct timeval iv;
1215 unsigned long msecs;
1216
1217 timersub(&this_read, &last_read, &iv);
1218 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1219
1220 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1221 " Last read %lu msecs ago.\n", msecs);
1222
1223 /*
1224 * head points to a known good entry, start there.
1225 */
1226 old = head;
1227 }
1228
1229 last_read = this_read;
1230
1231 for (; old != head;) {
1232 struct ip_event {
1233 struct perf_event_header header;
1234 __u64 ip;
1235 __u32 pid, tid;
1236 };
1237 struct mmap_event {
1238 struct perf_event_header header;
1239 __u32 pid, tid;
1240 __u64 start;
1241 __u64 len;
1242 __u64 pgoff;
1243 char filename[PATH_MAX];
1244 };
1245
1246 typedef union event_union {
1247 struct perf_event_header header;
1248 struct ip_event ip;
1249 struct mmap_event mmap;
1250 } event_t;
1251
1252 event_t *event = (event_t *)&data[old & md->mask];
1253
1254 event_t event_copy;
1255
1256 unsigned int size = event->header.size;
1257
1258 /*
1259 * Event straddles the mmap boundary -- header should always
1260 * be inside due to u64 alignment of output.
1261 */
1262 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1263 unsigned int offset = old;
1264 unsigned int len = min(sizeof(*event), size), cpy;
1265 void *dst = &event_copy;
1266
1267 do {
1268 cpy = min(md->mask + 1 - (offset & md->mask), len);
1269 memcpy(dst, &data[offset & md->mask], cpy);
1270 offset += cpy;
1271 dst += cpy;
1272 len -= cpy;
1273 } while (len);
1274
1275 event = &event_copy;
1276 }
1277
1278 old += size;
1279
1280 switch (event->header.type) {
1281 case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
1282 case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
1283 process_event(event->ip.ip, md->counter);
1284 break;
1285
1286 case PERF_EVENT_MMAP:
1287 case PERF_EVENT_MUNMAP:
1288 printf("%s: %Lu %Lu %Lu %s\n",
1289 event->header.type == PERF_EVENT_MMAP
1290 ? "mmap" : "munmap",
1291 event->mmap.start,
1292 event->mmap.len,
1293 event->mmap.pgoff,
1294 event->mmap.filename);
1295 break;
1296 }
1297 }
1298
1299 md->prev = old;
1300}
1301
1302int main(int argc, char *argv[])
1303{
1304 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1305 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1306 struct perf_counter_hw_event hw_event;
1307 pthread_t thread;
1308 int i, counter, group_fd, nr_poll = 0;
1309 unsigned int cpu;
1310 int ret;
1311
1312 page_size = sysconf(_SC_PAGE_SIZE);
1313
1314 process_options(argc, argv);
1315
1316 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1317 assert(nr_cpus <= MAX_NR_CPUS);
1318 assert(nr_cpus >= 0);
1319
1320 if (run_perfstat)
1321 return do_perfstat(argc, argv);
1322
1323 if (tid != -1 || profile_cpu != -1)
1324 nr_cpus = 1;
1325
1326 parse_symbols();
1327 if (vmlinux && sym_filter_entry)
1328 parse_vmlinux(vmlinux);
1329
1330 for (i = 0; i < nr_cpus; i++) {
1331 group_fd = -1;
1332 for (counter = 0; counter < nr_counters; counter++) {
1333
1334 cpu = profile_cpu;
1335 if (tid == -1 && profile_cpu == -1)
1336 cpu = i;
1337
1338 memset(&hw_event, 0, sizeof(hw_event));
1339 hw_event.config = event_id[counter];
1340 hw_event.irq_period = event_count[counter];
1341 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1342 hw_event.nmi = nmi;
1343 hw_event.mmap = use_mmap;
1344 hw_event.munmap = use_munmap;
1345
1346 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1347 if (fd[i][counter] < 0) {
1348 int err = errno;
1349 printf("kerneltop error: syscall returned with %d (%s)\n",
1350 fd[i][counter], strerror(err));
1351 if (err == EPERM)
1352 printf("Are you root?\n");
1353 exit(-1);
1354 }
1355 assert(fd[i][counter] >= 0);
1356 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1357
1358 /*
1359 * First counter acts as the group leader:
1360 */
1361 if (group && group_fd == -1)
1362 group_fd = fd[i][counter];
1363
1364 event_array[nr_poll].fd = fd[i][counter];
1365 event_array[nr_poll].events = POLLIN;
1366 nr_poll++;
1367
1368 mmap_array[i][counter].counter = counter;
1369 mmap_array[i][counter].prev = 0;
1370 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1371 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1372 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1373 if (mmap_array[i][counter].base == MAP_FAILED) {
1374 printf("kerneltop error: failed to mmap with %d (%s)\n",
1375 errno, strerror(errno));
1376 exit(-1);
1377 }
1378 }
1379 }
1380
1381 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1382 printf("Could not create display thread.\n");
1383 exit(-1);
1384 }
1385
1386 if (realtime_prio) {
1387 struct sched_param param;
1388
1389 param.sched_priority = realtime_prio;
1390 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1391 printf("Could not set realtime priority.\n");
1392 exit(-1);
1393 }
1394 }
1395
1396 while (1) {
1397 int hits = events;
1398
1399 for (i = 0; i < nr_cpus; i++) {
1400 for (counter = 0; counter < nr_counters; counter++)
1401 mmap_read(&mmap_array[i][counter]);
1402 }
1403
1404 if (hits == events)
1405 ret = poll(event_array, nr_poll, 100);
1406 }
1407
1408 return 0;
1409}
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7e034b0a6dd..20a44d0c9fdd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 131 */
132 132 struct irq_chip;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long test_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(void)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (1),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152static inline void clear_perf_counter_pending(void)
153{
154 asm volatile("stb %0,%1(13)" : :
155 "r" (0),
156 "i" (offsetof(struct paca_struct, perf_counter_pending)));
157}
158
159extern void perf_counter_do_pending(void);
160
161#else
162
163static inline unsigned long test_perf_counter_pending(void)
164{
165 return 0;
166}
167
168static inline void set_perf_counter_pending(void) {}
169static inline void clear_perf_counter_pending(void) {}
170static inline void perf_counter_do_pending(void) {}
171#endif /* CONFIG_PERF_COUNTERS */
172
134 173 #endif /* __KERNEL__ */
135 174 #endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 99	u8 soft_enabled;		/* irq soft-enable flag */
100 100	u8 hard_enabled;		/* set if irqs are enabled in MSR */
101 101	u8 io_sync;			/* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 104	/* Stuff for accurate time accounting */
104 105	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..9d7ff6d7fb56
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,72 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15
16/*
17 * This struct provides the constants and functions needed to
18 * describe the PMU on a particular POWER-family CPU.
19 */
20struct power_pmu {
21 int n_counter;
22 int max_alternatives;
23 u64 add_fields;
24 u64 test_adder;
25 int (*compute_mmcr)(unsigned int events[], int n_ev,
26 unsigned int hwc[], u64 mmcr[]);
27 int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
28 int (*get_alternatives)(unsigned int event, unsigned int alt[]);
29 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
30 int n_generic;
31 int *generic_events;
32};
33
34extern struct power_pmu *ppmu;
35
36/*
37 * The power_pmu.get_constraint function returns a 64-bit value and
38 * a 64-bit mask that express the constraints between this event and
39 * other events.
40 *
41 * The value and mask are divided up into (non-overlapping) bitfields
42 * of three different types:
43 *
44 * Select field: this expresses the constraint that some set of bits
45 * in MMCR* needs to be set to a specific value for this event. For a
46 * select field, the mask contains 1s in every bit of the field, and
47 * the value contains a unique value for each possible setting of the
48 * MMCR* bits. The constraint checking code will ensure that two events
49 * that set the same field in their masks have the same value in their
50 * value dwords.
51 *
52 * Add field: this expresses the constraint that there can be at most
53 * N events in a particular class. A field of k bits can be used for
54 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
55 * set (and the other bits 0), and the value has only the least significant
56 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
57 * in the struct power_pmu for this processor come into play. The
58 * add_fields value contains 1 in the LSB of the field, and the
59 * test_adder contains 2^(k-1) - 1 - N in the field.
60 *
61 * NAND field: this expresses the constraint that you may not have events
62 * in all of a set of classes. (For example, on PPC970, you can't select
63 * events from the FPU, ISU and IDU simultaneously, although any two are
64 * possible.) For N classes, the field is N+1 bits wide, and each class
65 * is assigned one bit from the least-significant N bits. The mask has
66 * only the most-significant bit set, and the value has only the bit
67 * for the event's class set. The test_adder has the least significant
68 * bit set in the field.
69 *
70 * If an event is not subject to the constraint expressed by a particular
71 * field, then it will have 0 in both the mask and value for that field.
72 */
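
As a concrete illustration of the three field types described above, here is how one event's (mask, value) pair could be built for a hypothetical PMU; the field positions and the limit N are invented for the example and do not come from any real POWER chip.

	/* Hypothetical example: build (mask, value) for one event. */
	u64 mask = 0, value = 0;

	/* Select field at bits 4-5 that must be set to 2 for this event:
	 * the mask covers the whole field, the value holds the setting. */
	mask  |= 0x30;
	value |= 0x20;

	/* Add field at bits 0-2 (k = 3) allowing at most N = 2 such events:
	 * per-event mask is the MSB of the field, per-event value the LSB;
	 * the PMU's add_fields would get 0x1 and test_adder would get
	 * 2^(k-1) - 1 - N = 1 in this field. */
	mask  |= 0x04;
	value |= 0x01;

	/* NAND field at bits 8-10 covering 2 classes, event in class 0:
	 * mask has only the MSB of the field, value has this class's bit,
	 * and test_adder would get the LSB of the field (0x100). */
	mask  |= 0x400;
	value |= 0x100;
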
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index fe166491e9dc..affa8caed7eb 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL_SPU(perf_counter_open)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c76ed77..7cef5afe89d8 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,11 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344 345
345#ifdef __KERNEL__ 346#ifdef __KERNEL__
346 347
347#define __NR_syscalls 319 348#define __NR_syscalls 320
348 349
349#define __NR__exit __NR_exit 350#define __NR__exit __NR_exit
350#define NR_syscalls __NR_syscalls 351#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 71901fbda4a5..9ba1bb731fcc 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,8 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
94 94
95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
98 power5-pmu.o power5+-pmu.o power6-pmu.o
97 99
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 100obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 101
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e40bc053946..e981d1ce1914 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
134 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
134 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 135 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
135 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 136 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
136 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 137 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index abfc32330479..43e073477c34 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
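
In C terms, the hunk added above amounts to roughly the following sketch, where soft_enable_being_restored stands in for the soft-enable state carried in r5 (this is only an approximation of the assembly, not code from the patch):

	/* Rough C equivalent of the added assembly: call into the perf
	 * code only when a PM interrupt was deferred and interrupts are
	 * being soft-enabled again. */
	if (get_paca()->perf_counter_pending && soft_enable_being_restored)
		perf_counter_do_pending();
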
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 5576147e57b6..2cd471f92fe6 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 135 iseries_handle_interrupts();
136 } 136 }
137 137
138 if (test_perf_counter_pending()) {
139 clear_perf_counter_pending();
140 perf_counter_do_pending();
141 }
142
138 /* 143 /*
139 * if (get_paca()->hard_enabled) return; 144 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 145 * But again we need to take care that gcc gets hard_enabled directly
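
The test_perf_counter_pending()/clear_perf_counter_pending() helpers used here come from the hw_irq.h hunk, which is not reproduced in full above. A minimal sketch, assuming the helpers are thin wrappers around the new paca field (see the hw_irq.h hunk for the real definitions):

	/* Sketch only: accessors backed by paca->perf_counter_pending. */
	static inline unsigned long test_perf_counter_pending(void)
	{
		return get_paca()->perf_counter_pending;
	}

	static inline void set_perf_counter_pending(void)
	{
		get_paca()->perf_counter_pending = 1;
	}

	static inline void clear_perf_counter_pending(void)
	{
		get_paca()->perf_counter_pending = 0;
	}
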
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..f88c35d0710a
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,846 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20
21struct cpu_hw_counters {
22 int n_counters;
23 int n_percpu;
24 int disabled;
25 int n_added;
26 struct perf_counter *counter[MAX_HWCOUNTERS];
27 unsigned int events[MAX_HWCOUNTERS];
28 u64 mmcr[3];
29 u8 pmcs_enabled;
30};
31DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
32
33struct power_pmu *ppmu;
34
35/*
36 * Normally, to ignore kernel events we set the FCS (freeze counters
37 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
38 * hypervisor bit set in the MSR, or if we are running on a processor
39 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
40 * then we need to use the FCHV bit to ignore kernel events.
41 */
42static unsigned int freeze_counters_kernel = MMCR0_FCS;
43
44static void perf_counter_interrupt(struct pt_regs *regs);
45
46void perf_counter_print_debug(void)
47{
48}
49
50/*
51 * Read one performance monitor counter (PMC).
52 */
53static unsigned long read_pmc(int idx)
54{
55 unsigned long val;
56
57 switch (idx) {
58 case 1:
59 val = mfspr(SPRN_PMC1);
60 break;
61 case 2:
62 val = mfspr(SPRN_PMC2);
63 break;
64 case 3:
65 val = mfspr(SPRN_PMC3);
66 break;
67 case 4:
68 val = mfspr(SPRN_PMC4);
69 break;
70 case 5:
71 val = mfspr(SPRN_PMC5);
72 break;
73 case 6:
74 val = mfspr(SPRN_PMC6);
75 break;
76 case 7:
77 val = mfspr(SPRN_PMC7);
78 break;
79 case 8:
80 val = mfspr(SPRN_PMC8);
81 break;
82 default:
83 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
84 val = 0;
85 }
86 return val;
87}
88
89/*
90 * Write one PMC.
91 */
92static void write_pmc(int idx, unsigned long val)
93{
94 switch (idx) {
95 case 1:
96 mtspr(SPRN_PMC1, val);
97 break;
98 case 2:
99 mtspr(SPRN_PMC2, val);
100 break;
101 case 3:
102 mtspr(SPRN_PMC3, val);
103 break;
104 case 4:
105 mtspr(SPRN_PMC4, val);
106 break;
107 case 5:
108 mtspr(SPRN_PMC5, val);
109 break;
110 case 6:
111 mtspr(SPRN_PMC6, val);
112 break;
113 case 7:
114 mtspr(SPRN_PMC7, val);
115 break;
116 case 8:
117 mtspr(SPRN_PMC8, val);
118 break;
119 default:
120 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
121 }
122}
123
124/*
125 * Check if a set of events can all go on the PMU at once.
126 * If they can't, this will look at alternative codes for the events
127 * and see if any combination of alternative codes is feasible.
128 * The feasible set is returned in event[].
129 */
130static int power_check_constraints(unsigned int event[], int n_ev)
131{
132 u64 mask, value, nv;
133 unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
134 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
135 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
136 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
137 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
138 int i, j;
139 u64 addf = ppmu->add_fields;
140 u64 tadd = ppmu->test_adder;
141
142 if (n_ev > ppmu->n_counter)
143 return -1;
144
145 /* First see if the events will go on as-is */
146 for (i = 0; i < n_ev; ++i) {
147 alternatives[i][0] = event[i];
148 if (ppmu->get_constraint(event[i], &amasks[i][0],
149 &avalues[i][0]))
150 return -1;
151 choice[i] = 0;
152 }
153 value = mask = 0;
154 for (i = 0; i < n_ev; ++i) {
155 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
156 if ((((nv + tadd) ^ value) & mask) != 0 ||
157 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
158 break;
159 value = nv;
160 mask |= amasks[i][0];
161 }
162 if (i == n_ev)
163 return 0; /* all OK */
164
165 /* doesn't work, gather alternatives... */
166 if (!ppmu->get_alternatives)
167 return -1;
168 for (i = 0; i < n_ev; ++i) {
169 n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
170 for (j = 1; j < n_alt[i]; ++j)
171 ppmu->get_constraint(alternatives[i][j],
172 &amasks[i][j], &avalues[i][j]);
173 }
174
175 /* enumerate all possibilities and see if any will work */
176 i = 0;
177 j = -1;
178 value = mask = nv = 0;
179 while (i < n_ev) {
180 if (j >= 0) {
181 /* we're backtracking, restore context */
182 value = svalues[i];
183 mask = smasks[i];
184 j = choice[i];
185 }
186 /*
187 * See if any alternative k for event i,
188 * where k > j, will satisfy the constraints.
189 */
190 while (++j < n_alt[i]) {
191 nv = (value | avalues[i][j]) +
192 (value & avalues[i][j] & addf);
193 if ((((nv + tadd) ^ value) & mask) == 0 &&
194 (((nv + tadd) ^ avalues[i][j])
195 & amasks[i][j]) == 0)
196 break;
197 }
198 if (j >= n_alt[i]) {
199 /*
200 * No feasible alternative, backtrack
201 * to event i-1 and continue enumerating its
202 * alternatives from where we got up to.
203 */
204 if (--i < 0)
205 return -1;
206 } else {
207 /*
208 * Found a feasible alternative for event i,
209 * remember where we got up to with this event,
210 * go on to the next event, and start with
211 * the first alternative for it.
212 */
213 choice[i] = j;
214 svalues[i] = value;
215 smasks[i] = mask;
216 value = nv;
217 mask |= amasks[i][j];
218 ++i;
219 j = -1;
220 }
221 }
222
223 /* OK, we have a feasible combination, tell the caller the solution */
224 for (i = 0; i < n_ev; ++i)
225 event[i] = alternatives[i][choice[i]];
226 return 0;
227}
228
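
To see the add-field arithmetic in action, the following standalone program (not kernel code) runs the same nv/test_adder check over a single made-up 3-bit add field that permits at most two events; the third event trips the check.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long addf = 1, tadd = 1;	/* add_fields, test_adder */
		unsigned long long avalue = 1, amask = 4; /* one event's constraint */
		unsigned long long value = 0, mask = 0, nv;
		int i;

		for (i = 1; i <= 3; ++i) {
			nv = (value | avalue) + (value & avalue & addf);
			if ((((nv + tadd) ^ value) & mask) ||
			    (((nv + tadd) ^ avalue) & amask)) {
				printf("event %d rejected\n", i);	/* fires for i == 3 */
				break;
			}
			value = nv;
			mask |= amask;
			printf("event %d accepted, field value now %llu\n", i, value);
		}
		return 0;
	}
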
229/*
230 * Check if newly-added counters have consistent settings for
231 * exclude_{user,kernel,hv} with each other and any previously
232 * added counters.
233 */
234static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
235{
236 int eu, ek, eh;
237 int i, n;
238 struct perf_counter *counter;
239
240 n = n_prev + n_new;
241 if (n <= 1)
242 return 0;
243
244 eu = ctrs[0]->hw_event.exclude_user;
245 ek = ctrs[0]->hw_event.exclude_kernel;
246 eh = ctrs[0]->hw_event.exclude_hv;
247 if (n_prev == 0)
248 n_prev = 1;
249 for (i = n_prev; i < n; ++i) {
250 counter = ctrs[i];
251 if (counter->hw_event.exclude_user != eu ||
252 counter->hw_event.exclude_kernel != ek ||
253 counter->hw_event.exclude_hv != eh)
254 return -EAGAIN;
255 }
256 return 0;
257}
258
259static void power_perf_read(struct perf_counter *counter)
260{
261 long val, delta, prev;
262
263 if (!counter->hw.idx)
264 return;
265 /*
266 * Performance monitor interrupts come even when interrupts
267 * are soft-disabled, as long as interrupts are hard-enabled.
268 * Therefore we treat them like NMIs.
269 */
270 do {
271 prev = atomic64_read(&counter->hw.prev_count);
272 barrier();
273 val = read_pmc(counter->hw.idx);
274 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
275
276 /* The counters are only 32 bits wide */
277 delta = (val - prev) & 0xfffffffful;
278 atomic64_add(delta, &counter->count);
279 atomic64_sub(delta, &counter->hw.period_left);
280}
281
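
Masking the difference to 32 bits is what makes the delta come out right even if the hardware counter wrapped between the two reads; a standalone illustration (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long prev = 0xfffffff0ul;	/* just before the 32-bit counter wraps */
		unsigned long val  = 0x00000010ul;	/* read again after it wrapped */
		long delta = (val - prev) & 0xfffffffful;

		printf("delta = %ld\n", delta);		/* prints 32 */
		return 0;
	}
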
282/*
283 * Disable all counters to prevent PMU interrupts and to allow
284 * counters to be added or removed.
285 */
286u64 hw_perf_save_disable(void)
287{
288 struct cpu_hw_counters *cpuhw;
289 unsigned long ret;
290 unsigned long flags;
291
292 local_irq_save(flags);
293 cpuhw = &__get_cpu_var(cpu_hw_counters);
294
295 ret = cpuhw->disabled;
296 if (!ret) {
297 cpuhw->disabled = 1;
298 cpuhw->n_added = 0;
299
300 /*
301 * Check if we ever enabled the PMU on this cpu.
302 */
303 if (!cpuhw->pmcs_enabled) {
304 if (ppc_md.enable_pmcs)
305 ppc_md.enable_pmcs();
306 cpuhw->pmcs_enabled = 1;
307 }
308
309 /*
310 * Set the 'freeze counters' bit.
311 * The barrier is to make sure the mtspr has been
312 * executed and the PMU has frozen the counters
313 * before we return.
314 */
315 mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
316 mb();
317 }
318 local_irq_restore(flags);
319 return ret;
320}
321
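
The return value is meant to be handed back to hw_perf_restore(); the typical pattern, which power_perf_enable() and power_perf_disable() below both follow, looks like this sketch:

	/* Sketch of the save/restore pairing used later in this file. */
	u64 pmudis = hw_perf_save_disable();	/* freeze the PMU */
	/* ... add or remove counters in cpu_hw_counters ... */
	hw_perf_restore(pmudis);		/* recompute and reload MMCR and PMC state */
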
322/*
323 * Re-enable all counters if disable == 0.
324 * If we were previously disabled and counters were added, then
325 * put the new config on the PMU.
326 */
327void hw_perf_restore(u64 disable)
328{
329 struct perf_counter *counter;
330 struct cpu_hw_counters *cpuhw;
331 unsigned long flags;
332 long i;
333 unsigned long val;
334 s64 left;
335 unsigned int hwc_index[MAX_HWCOUNTERS];
336
337 if (disable)
338 return;
339 local_irq_save(flags);
340 cpuhw = &__get_cpu_var(cpu_hw_counters);
341 cpuhw->disabled = 0;
342
343 /*
344 * If we didn't change anything, or only removed counters,
345 * no need to recalculate MMCR* settings and reset the PMCs.
346 * Just reenable the PMU with the current MMCR* settings
347 * (possibly updated for removal of counters).
348 */
349 if (!cpuhw->n_added) {
350 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
351 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
352 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
353 if (cpuhw->n_counters == 0)
354 get_lppaca()->pmcregs_in_use = 0;
355 goto out;
356 }
357
358 /*
359 * Compute MMCR* values for the new set of counters
360 */
361 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
362 cpuhw->mmcr)) {
363 /* shouldn't ever get here */
364 printk(KERN_ERR "oops compute_mmcr failed\n");
365 goto out;
366 }
367
368 /*
369 * Add in MMCR0 freeze bits corresponding to the
370 * hw_event.exclude_* bits for the first counter.
371 * We have already checked that all counters have the
372 * same values for these bits as the first counter.
373 */
374 counter = cpuhw->counter[0];
375 if (counter->hw_event.exclude_user)
376 cpuhw->mmcr[0] |= MMCR0_FCP;
377 if (counter->hw_event.exclude_kernel)
378 cpuhw->mmcr[0] |= freeze_counters_kernel;
379 if (counter->hw_event.exclude_hv)
380 cpuhw->mmcr[0] |= MMCR0_FCHV;
381
382 /*
383 * Write the new configuration to MMCR* with the freeze
384 * bit set and set the hardware counters to their initial values.
385 * Then unfreeze the counters.
386 */
387 get_lppaca()->pmcregs_in_use = 1;
388 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
389 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
390 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
391 | MMCR0_FC);
392
393 /*
394 * Read off any pre-existing counters that need to move
395 * to another PMC.
396 */
397 for (i = 0; i < cpuhw->n_counters; ++i) {
398 counter = cpuhw->counter[i];
399 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
400 power_perf_read(counter);
401 write_pmc(counter->hw.idx, 0);
402 counter->hw.idx = 0;
403 }
404 }
405
406 /*
407 * Initialize the PMCs for all the new and moved counters.
408 */
409 for (i = 0; i < cpuhw->n_counters; ++i) {
410 counter = cpuhw->counter[i];
411 if (counter->hw.idx)
412 continue;
413 val = 0;
414 if (counter->hw_event.irq_period) {
415 left = atomic64_read(&counter->hw.period_left);
416 if (left < 0x80000000L)
417 val = 0x80000000L - left;
418 }
419 atomic64_set(&counter->hw.prev_count, val);
420 counter->hw.idx = hwc_index[i] + 1;
421 write_pmc(counter->hw.idx, val);
422 perf_counter_update_userpage(counter);
423 }
424 mb();
425 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
426 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
427
428 out:
429 local_irq_restore(flags);
430}
431
432static int collect_events(struct perf_counter *group, int max_count,
433 struct perf_counter *ctrs[], unsigned int *events)
434{
435 int n = 0;
436 struct perf_counter *counter;
437
438 if (!is_software_counter(group)) {
439 if (n >= max_count)
440 return -1;
441 ctrs[n] = group;
442 events[n++] = group->hw.config;
443 }
444 list_for_each_entry(counter, &group->sibling_list, list_entry) {
445 if (!is_software_counter(counter) &&
446 counter->state != PERF_COUNTER_STATE_OFF) {
447 if (n >= max_count)
448 return -1;
449 ctrs[n] = counter;
450 events[n++] = counter->hw.config;
451 }
452 }
453 return n;
454}
455
456static void counter_sched_in(struct perf_counter *counter, int cpu)
457{
458 counter->state = PERF_COUNTER_STATE_ACTIVE;
459 counter->oncpu = cpu;
460 counter->tstamp_running += counter->ctx->time_now -
461 counter->tstamp_stopped;
462 if (is_software_counter(counter))
463 counter->hw_ops->enable(counter);
464}
465
466/*
467 * Called to enable a whole group of counters.
468 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
469 * Assumes the caller has disabled interrupts and has
470 * frozen the PMU with hw_perf_save_disable.
471 */
472int hw_perf_group_sched_in(struct perf_counter *group_leader,
473 struct perf_cpu_context *cpuctx,
474 struct perf_counter_context *ctx, int cpu)
475{
476 struct cpu_hw_counters *cpuhw;
477 long i, n, n0;
478 struct perf_counter *sub;
479
480 cpuhw = &__get_cpu_var(cpu_hw_counters);
481 n0 = cpuhw->n_counters;
482 n = collect_events(group_leader, ppmu->n_counter - n0,
483 &cpuhw->counter[n0], &cpuhw->events[n0]);
484 if (n < 0)
485 return -EAGAIN;
486 if (check_excludes(cpuhw->counter, n0, n))
487 return -EAGAIN;
488 if (power_check_constraints(cpuhw->events, n + n0))
489 return -EAGAIN;
490 cpuhw->n_counters = n0 + n;
491 cpuhw->n_added += n;
492
493 /*
494 * OK, this group can go on; update counter states etc.,
495 * and enable any software counters
496 */
497 for (i = n0; i < n0 + n; ++i)
498 cpuhw->counter[i]->hw.config = cpuhw->events[i];
499 cpuctx->active_oncpu += n;
500 n = 1;
501 counter_sched_in(group_leader, cpu);
502 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
503 if (sub->state != PERF_COUNTER_STATE_OFF) {
504 counter_sched_in(sub, cpu);
505 ++n;
506 }
507 }
508 ctx->nr_active += n;
509
510 return 1;
511}
512
513/*
514 * Add a counter to the PMU.
515 * If all counters are not already frozen, then we disable and
516 * re-enable the PMU in order to get hw_perf_restore to do the
517 * actual work of reconfiguring the PMU.
518 */
519static int power_perf_enable(struct perf_counter *counter)
520{
521 struct cpu_hw_counters *cpuhw;
522 unsigned long flags;
523 u64 pmudis;
524 int n0;
525 int ret = -EAGAIN;
526
527 local_irq_save(flags);
528 pmudis = hw_perf_save_disable();
529
530 /*
531 * Add the counter to the list (if there is room)
532 * and check whether the total set is still feasible.
533 */
534 cpuhw = &__get_cpu_var(cpu_hw_counters);
535 n0 = cpuhw->n_counters;
536 if (n0 >= ppmu->n_counter)
537 goto out;
538 cpuhw->counter[n0] = counter;
539 cpuhw->events[n0] = counter->hw.config;
540 if (check_excludes(cpuhw->counter, n0, 1))
541 goto out;
542 if (power_check_constraints(cpuhw->events, n0 + 1))
543 goto out;
544
545 counter->hw.config = cpuhw->events[n0];
546 ++cpuhw->n_counters;
547 ++cpuhw->n_added;
548
549 ret = 0;
550 out:
551 hw_perf_restore(pmudis);
552 local_irq_restore(flags);
553 return ret;
554}
555
556/*
557 * Remove a counter from the PMU.
558 */
559static void power_perf_disable(struct perf_counter *counter)
560{
561 struct cpu_hw_counters *cpuhw;
562 long i;
563 u64 pmudis;
564 unsigned long flags;
565
566 local_irq_save(flags);
567 pmudis = hw_perf_save_disable();
568
569 power_perf_read(counter);
570
571 cpuhw = &__get_cpu_var(cpu_hw_counters);
572 for (i = 0; i < cpuhw->n_counters; ++i) {
573 if (counter == cpuhw->counter[i]) {
574 while (++i < cpuhw->n_counters)
575 cpuhw->counter[i-1] = cpuhw->counter[i];
576 --cpuhw->n_counters;
577 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
578 write_pmc(counter->hw.idx, 0);
579 counter->hw.idx = 0;
580 perf_counter_update_userpage(counter);
581 break;
582 }
583 }
584 if (cpuhw->n_counters == 0) {
585 /* disable exceptions if no counters are running */
586 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
587 }
588
589 hw_perf_restore(pmudis);
590 local_irq_restore(flags);
591}
592
593struct hw_perf_counter_ops power_perf_ops = {
594 .enable = power_perf_enable,
595 .disable = power_perf_disable,
596 .read = power_perf_read
597};
598
599/* Number of perf_counters counting hardware events */
600static atomic_t num_counters;
601/* Used to avoid races in calling reserve/release_pmc_hardware */
602static DEFINE_MUTEX(pmc_reserve_mutex);
603
604/*
605 * Release the PMU if this is the last perf_counter.
606 */
607static void hw_perf_counter_destroy(struct perf_counter *counter)
608{
609 if (!atomic_add_unless(&num_counters, -1, 1)) {
610 mutex_lock(&pmc_reserve_mutex);
611 if (atomic_dec_return(&num_counters) == 0)
612 release_pmc_hardware();
613 mutex_unlock(&pmc_reserve_mutex);
614 }
615}
616
617const struct hw_perf_counter_ops *
618hw_perf_counter_init(struct perf_counter *counter)
619{
620 unsigned long ev;
621 struct perf_counter *ctrs[MAX_HWCOUNTERS];
622 unsigned int events[MAX_HWCOUNTERS];
623 int n;
624 int err;
625
626 if (!ppmu)
627 return ERR_PTR(-ENXIO);
628 if ((s64)counter->hw_event.irq_period < 0)
629 return ERR_PTR(-EINVAL);
630 if (!perf_event_raw(&counter->hw_event)) {
631 ev = perf_event_id(&counter->hw_event);
632 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
633 return ERR_PTR(-EOPNOTSUPP);
634 ev = ppmu->generic_events[ev];
635 } else {
636 ev = perf_event_config(&counter->hw_event);
637 }
638 counter->hw.config_base = ev;
639 counter->hw.idx = 0;
640
641 /*
642 * If we are not running on a hypervisor, force the
643 * exclude_hv bit to 0 so that we don't care what
644 * the user set it to.
645 */
646 if (!firmware_has_feature(FW_FEATURE_LPAR))
647 counter->hw_event.exclude_hv = 0;
648
649 /*
650 * If this is in a group, check if it can go on with all the
651 * other hardware counters in the group. We assume the counter
652 * hasn't been linked into its leader's sibling list at this point.
653 */
654 n = 0;
655 if (counter->group_leader != counter) {
656 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
657 ctrs, events);
658 if (n < 0)
659 return ERR_PTR(-EINVAL);
660 }
661 events[n] = ev;
662 ctrs[n] = counter;
663 if (check_excludes(ctrs, n, 1))
664 return ERR_PTR(-EINVAL);
665 if (power_check_constraints(events, n + 1))
666 return ERR_PTR(-EINVAL);
667
668 counter->hw.config = events[n];
669 atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
670
671 /*
672 * See if we need to reserve the PMU.
673 * If no counters are currently in use, then we have to take a
674 * mutex to ensure that we don't race with another task doing
675 * reserve_pmc_hardware or release_pmc_hardware.
676 */
677 err = 0;
678 if (!atomic_inc_not_zero(&num_counters)) {
679 mutex_lock(&pmc_reserve_mutex);
680 if (atomic_read(&num_counters) == 0 &&
681 reserve_pmc_hardware(perf_counter_interrupt))
682 err = -EBUSY;
683 else
684 atomic_inc(&num_counters);
685 mutex_unlock(&pmc_reserve_mutex);
686 }
687 counter->destroy = hw_perf_counter_destroy;
688
689 if (err)
690 return ERR_PTR(err);
691 return &power_perf_ops;
692}
693
694/*
695 * A counter has overflowed; update its count and record
696 * things if requested. Note that interrupts are hard-disabled
697 * here so there is no possibility of being interrupted.
698 */
699static void record_and_restart(struct perf_counter *counter, long val,
700 struct pt_regs *regs)
701{
702 s64 prev, delta, left;
703 int record = 0;
704
705 /* we don't have to worry about interrupts here */
706 prev = atomic64_read(&counter->hw.prev_count);
707 delta = (val - prev) & 0xfffffffful;
708 atomic64_add(delta, &counter->count);
709
710 /*
711 * See if the total period for this counter has expired,
712 * and update for the next period.
713 */
714 val = 0;
715 left = atomic64_read(&counter->hw.period_left) - delta;
716 if (counter->hw_event.irq_period) {
717 if (left <= 0) {
718 left += counter->hw_event.irq_period;
719 if (left <= 0)
720 left = counter->hw_event.irq_period;
721 record = 1;
722 }
723 if (left < 0x80000000L)
724 val = 0x80000000L - left;
725 }
726 write_pmc(counter->hw.idx, val);
727 atomic64_set(&counter->hw.prev_count, val);
728 atomic64_set(&counter->hw.period_left, left);
729 perf_counter_update_userpage(counter);
730
731 /*
732 * Finally record data if requested.
733 */
734 if (record)
735 perf_counter_overflow(counter, 1, regs);
736}
737
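
The 0x80000000 - left initial value works because the PMCs raise a performance monitor exception when their most-significant bit becomes set; starting that far below the sign bit means the next interrupt arrives after exactly `left` further events. A standalone illustration (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		long left = 100000;			/* events remaining in this period */
		unsigned int pmc = 0x80000000u - left;	/* initial PMC value written above */

		pmc += left;				/* ... `left` events later */
		printf("overflow (MSB set): %d\n", (int)pmc < 0);	/* prints 1 */
		return 0;
	}
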
738/*
739 * Performance monitor interrupt stuff
740 */
741static void perf_counter_interrupt(struct pt_regs *regs)
742{
743 int i;
744 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
745 struct perf_counter *counter;
746 long val;
747 int found = 0;
748
749 for (i = 0; i < cpuhw->n_counters; ++i) {
750 counter = cpuhw->counter[i];
751 val = read_pmc(counter->hw.idx);
752 if ((int)val < 0) {
753 /* counter has overflowed */
754 found = 1;
755 record_and_restart(counter, val, regs);
756 }
757 }
758
759 /*
760 * In case we didn't find and reset the counter that caused
761 * the interrupt, scan all counters and reset any that are
762 * negative, to avoid getting continual interrupts.
763 * Any that we processed in the previous loop will not be negative.
764 */
765 if (!found) {
766 for (i = 0; i < ppmu->n_counter; ++i) {
767 val = read_pmc(i + 1);
768 if ((int)val < 0)
769 write_pmc(i + 1, 0);
770 }
771 }
772
773 /*
774 * Reset MMCR0 to its normal value. This will set PMXE and
775 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
776 * and thus allow interrupts to occur again.
777 * XXX might want to use MSR.PM to keep the counters frozen until
778 * we get back out of this interrupt.
779 */
780 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
781
782 /*
783 * If we need a wakeup, check whether interrupts were soft-enabled
784 * when we took the interrupt. If they were, we can wake stuff up
785 * immediately; otherwise we'll have to do the wakeup when interrupts
786 * get soft-enabled.
787 */
788 if (test_perf_counter_pending() && regs->softe) {
789 irq_enter();
790 clear_perf_counter_pending();
791 perf_counter_do_pending();
792 irq_exit();
793 }
794}
795
796void hw_perf_counter_setup(int cpu)
797{
798 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
799
800 memset(cpuhw, 0, sizeof(*cpuhw));
801 cpuhw->mmcr[0] = MMCR0_FC;
802}
803
804extern struct power_pmu power4_pmu;
805extern struct power_pmu ppc970_pmu;
806extern struct power_pmu power5_pmu;
807extern struct power_pmu power5p_pmu;
808extern struct power_pmu power6_pmu;
809
810static int init_perf_counters(void)
811{
812 unsigned long pvr;
813
814 /* XXX should get this from cputable */
815 pvr = mfspr(SPRN_PVR);
816 switch (PVR_VER(pvr)) {
817 case PV_POWER4:
818 case PV_POWER4p:
819 ppmu = &power4_pmu;
820 break;
821 case PV_970:
822 case PV_970FX:
823 case PV_970MP:
824 ppmu = &ppc970_pmu;
825 break;
826 case PV_POWER5:
827 ppmu = &power5_pmu;
828 break;
829 case PV_POWER5p:
830 ppmu = &power5p_pmu;
831 break;
832 case 0x3e:
833 ppmu = &power6_pmu;
834 break;
835 }
836
837 /*
838 * Use FCHV to ignore kernel events if MSR.HV is set.
839 */
840 if (mfmsr() & MSR_HV)
841 freeze_counters_kernel = MMCR0_FCHV;
842
843 return 0;
844}
845
846arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 000000000000..1407b19ab619
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,557 @@
1/*
2 * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER4
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_LOWER_SH 6
23#define PM_LOWER_MSK 1
24#define PM_LOWER_MSKS 0x40
25#define PM_BYTE_SH 4 /* Byte number of event bus to use */
26#define PM_BYTE_MSK 3
27#define PM_PMCSEL_MSK 7
28
29/*
30 * Unit code values
31 */
32#define PM_FPU 1
33#define PM_ISU1 2
34#define PM_IFU 3
35#define PM_IDU0 4
36#define PM_ISU1_ALT 6
37#define PM_ISU2 7
38#define PM_IFU_ALT 8
39#define PM_LSU0 9
40#define PM_LSU1 0xc
41#define PM_GPS 0xf
42
43/*
44 * Bits in MMCR0 for POWER4
45 */
46#define MMCR0_PMC1SEL_SH 8
47#define MMCR0_PMC2SEL_SH 1
48#define MMCR_PMCSEL_MSK 0x1f
49
50/*
51 * Bits in MMCR1 for POWER4
52 */
53#define MMCR1_TTM0SEL_SH 62
54#define MMCR1_TTC0SEL_SH 61
55#define MMCR1_TTM1SEL_SH 59
56#define MMCR1_TTC1SEL_SH 58
57#define MMCR1_TTM2SEL_SH 56
58#define MMCR1_TTC2SEL_SH 55
59#define MMCR1_TTM3SEL_SH 53
60#define MMCR1_TTC3SEL_SH 52
61#define MMCR1_TTMSEL_MSK 3
62#define MMCR1_TD_CP_DBG0SEL_SH 50
63#define MMCR1_TD_CP_DBG1SEL_SH 48
64#define MMCR1_TD_CP_DBG2SEL_SH 46
65#define MMCR1_TD_CP_DBG3SEL_SH 44
66#define MMCR1_DEBUG0SEL_SH 43
67#define MMCR1_DEBUG1SEL_SH 42
68#define MMCR1_DEBUG2SEL_SH 41
69#define MMCR1_DEBUG3SEL_SH 40
70#define MMCR1_PMC1_ADDER_SEL_SH 39
71#define MMCR1_PMC2_ADDER_SEL_SH 38
72#define MMCR1_PMC6_ADDER_SEL_SH 37
73#define MMCR1_PMC5_ADDER_SEL_SH 36
74#define MMCR1_PMC8_ADDER_SEL_SH 35
75#define MMCR1_PMC7_ADDER_SEL_SH 34
76#define MMCR1_PMC3_ADDER_SEL_SH 33
77#define MMCR1_PMC4_ADDER_SEL_SH 32
78#define MMCR1_PMC3SEL_SH 27
79#define MMCR1_PMC4SEL_SH 22
80#define MMCR1_PMC5SEL_SH 17
81#define MMCR1_PMC6SEL_SH 12
82#define MMCR1_PMC7SEL_SH 7
83#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
84
85static short mmcr1_adder_bits[8] = {
86 MMCR1_PMC1_ADDER_SEL_SH,
87 MMCR1_PMC2_ADDER_SEL_SH,
88 MMCR1_PMC3_ADDER_SEL_SH,
89 MMCR1_PMC4_ADDER_SEL_SH,
90 MMCR1_PMC5_ADDER_SEL_SH,
91 MMCR1_PMC6_ADDER_SEL_SH,
92 MMCR1_PMC7_ADDER_SEL_SH,
93 MMCR1_PMC8_ADDER_SEL_SH
94};
95
96/*
97 * Bits in MMCRA
98 */
99#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
100
101/*
102 * Layout of constraint bits:
103 * 6666555555555544444444443333333333222222222211111111110000000000
104 * 3210987654321098765432109876543210987654321098765432109876543210
105 * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
106 * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
107 * \SMPL ||\TTC3SEL
108 * |\TTC_IFU_SEL
109 * \TTM2SEL0
110 *
111 * SMPL - SAMPLE_ENABLE constraint
112 * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
113 *
114 * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
115 * 55: UC1 error 0x0080_0000_0000_0000
116 * 54: FPU events needed 0x0040_0000_0000_0000
117 * 53: ISU1 events needed 0x0020_0000_0000_0000
118 * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
119 *
120 * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
121 * 51: UC2 error 0x0008_0000_0000_0000
122 * 50: FPU events needed 0x0004_0000_0000_0000
123 * 49: IFU events needed 0x0002_0000_0000_0000
124 * 48: LSU0 events needed 0x0001_0000_0000_0000
125 *
126 * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
127 * 47: UC3 error 0x8000_0000_0000
128 * 46: LSU0 events needed 0x4000_0000_0000
129 * 45: IFU events needed 0x2000_0000_0000
130 * 44: IDU0|ISU2 events needed 0x1000_0000_0000
131 * 43: ISU1 events needed 0x0800_0000_0000
132 *
133 * TTM2SEL0
134 * 42: 0 = IDU0 events needed
135 * 1 = ISU2 events needed 0x0400_0000_0000
136 *
137 * TTC_IFU_SEL
138 * 41: 0 = IFU.U events needed
139 * 1 = IFU.L events needed 0x0200_0000_0000
140 *
141 * TTC3SEL
142 * 40: 0 = LSU1.U events needed
143 * 1 = LSU1.L events needed 0x0100_0000_0000
144 *
145 * PS1
146 * 39: PS1 error 0x0080_0000_0000
147 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
148 *
149 * PS2
150 * 35: PS2 error 0x0008_0000_0000
151 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
152 *
153 * B0
154 * 28-31: Byte 0 event source 0xf000_0000
155 * 1 = FPU
156 * 2 = ISU1
157 * 3 = IFU
158 * 4 = IDU0
159 * 7 = ISU2
160 * 9 = LSU0
161 * c = LSU1
162 * f = GPS
163 *
164 * B1, B2, B3
165 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
166 *
167 * P8
168 * 15: P8 error 0x8000
169 * 14-15: Count of events needing PMC8
170 *
171 * P1..P7
172 * 0-13: Count of events needing PMC1..PMC7
173 *
174 * Note: this doesn't allow events using IFU.U to be combined with events
175 * using IFU.L, though that is feasible (using TTM0 and TTM2). However
176 * there are no listed events for IFU.L (they are debug events not
177 * verified for performance monitoring) so this shouldn't cause a
178 * problem.
179 */
180
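
As a worked example of the event-code fields defined at the top of this file, decoding 0x8c10 (the PM_LD_REF_L1 code used in p4_generic_events near the end of the file) with those macros gives the following; the snippet is illustrative, not part of the patch:

	/* Field decode of event code 0x8c10 (PM_LD_REF_L1). */
	unsigned int event = 0x8c10;
	unsigned int pmc   = (event >> PM_PMC_SH)  & PM_PMC_MSK;	/* 8   -> PMC8    */
	unsigned int unit  = (event >> PM_UNIT_SH) & PM_UNIT_MSK;	/* 0xc -> PM_LSU1 */
	unsigned int byte  = (event >> PM_BYTE_SH) & PM_BYTE_MSK;	/* 1   -> bus byte 1 */
	unsigned int psel  = event & PM_PMCSEL_MSK;			/* 0 */
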
181static struct unitinfo {
182 u64 value, mask;
183 int unit;
184 int lowerbit;
185} p4_unitinfo[16] = {
186 [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
187 [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
188 [PM_ISU1_ALT] =
189 { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
190 [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
191 [PM_IFU_ALT] =
192 { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
193 [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
194 [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
195 [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
196 [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
197 [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
198};
199
200static unsigned char direct_marked_event[8] = {
201 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
202 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
203 (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
204 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
205 (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
206 (1<<3) | (1<<4) | (1<<5),
207 /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
208 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
209 (1<<4), /* PMC8: PM_MRK_LSU_FIN */
210};
211
212/*
213 * Returns 1 if event counts things relating to marked instructions
214 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
215 */
216static int p4_marked_instr_event(unsigned int event)
217{
218 int pmc, psel, unit, byte, bit;
219 unsigned int mask;
220
221 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
222 psel = event & PM_PMCSEL_MSK;
223 if (pmc) {
224 if (direct_marked_event[pmc - 1] & (1 << psel))
225 return 1;
226 if (psel == 0) /* add events */
227 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
228 else if (psel == 6) /* decode events */
229 bit = 4;
230 else
231 return 0;
232 } else
233 bit = psel;
234
235 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
236 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
237 mask = 0;
238 switch (unit) {
239 case PM_LSU1:
240 if (event & PM_LOWER_MSKS)
241 mask = 1 << 28; /* byte 7 bit 4 */
242 else
243 mask = 6 << 24; /* byte 3 bits 1 and 2 */
244 break;
245 case PM_LSU0:
246 /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
247 mask = 0x083dff00;
248 }
249 return (mask >> (byte * 8 + bit)) & 1;
250}
251
252static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
253{
254 int pmc, byte, unit, lower, sh;
255 u64 mask = 0, value = 0;
256 int grp = -1;
257
258 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
259 if (pmc) {
260 if (pmc > 8)
261 return -1;
262 sh = (pmc - 1) * 2;
263 mask |= 2 << sh;
264 value |= 1 << sh;
265 grp = ((pmc - 1) >> 1) & 1;
266 }
267 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
268 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
269 if (unit) {
270 lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
271
272 /*
273 * Bus events on bytes 0 and 2 can be counted
274 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
275 */
276 if (!pmc)
277 grp = byte & 1;
278
279 if (!p4_unitinfo[unit].unit)
280 return -1;
281 mask |= p4_unitinfo[unit].mask;
282 value |= p4_unitinfo[unit].value;
283 sh = p4_unitinfo[unit].lowerbit;
284 if (sh > 1)
285 value |= (u64)lower << sh;
286 else if (lower != sh)
287 return -1;
288 unit = p4_unitinfo[unit].unit;
289
290 /* Set byte lane select field */
291 mask |= 0xfULL << (28 - 4 * byte);
292 value |= (u64)unit << (28 - 4 * byte);
293 }
294 if (grp == 0) {
295 /* increment PMC1/2/5/6 field */
296 mask |= 0x8000000000ull;
297 value |= 0x1000000000ull;
298 } else {
299 /* increment PMC3/4/7/8 field */
300 mask |= 0x800000000ull;
301 value |= 0x100000000ull;
302 }
303
304 /* Marked instruction events need sample_enable set */
305 if (p4_marked_instr_event(event)) {
306 mask |= 1ull << 56;
307 value |= 1ull << 56;
308 }
309
310 /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
311 if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
312 mask |= 1ull << 56;
313
314 *maskp = mask;
315 *valp = value;
316 return 0;
317}
318
319static unsigned int ppc_inst_cmpl[] = {
320 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
321};
322
323static int p4_get_alternatives(unsigned int event, unsigned int alt[])
324{
325 int i, j, na;
326
327 alt[0] = event;
328 na = 1;
329
330 /* 2 possibilities for PM_GRP_DISP_REJECT */
331 if (event == 0x8003 || event == 0x0224) {
332 alt[1] = event ^ (0x8003 ^ 0x0224);
333 return 2;
334 }
335
336 /* 2 possibilities for PM_ST_MISS_L1 */
337 if (event == 0x0c13 || event == 0x0c23) {
338 alt[1] = event ^ (0x0c13 ^ 0x0c23);
339 return 2;
340 }
341
342 /* several possibilities for PM_INST_CMPL */
343 for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
344 if (event == ppc_inst_cmpl[i]) {
345 for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
346 if (j != i)
347 alt[na++] = ppc_inst_cmpl[j];
348 break;
349 }
350 }
351
352 return na;
353}
354
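
For instance, since PM_GRP_DISP_REJECT has the two encodings handled above, a call such as the following would return both (illustrative call, not from the patch):

	unsigned int alt[MAX_EVENT_ALTERNATIVES];
	int na = p4_get_alternatives(0x8003, alt);	/* PM_GRP_DISP_REJECT */
	/* expected: na == 2, alt[0] == 0x8003, alt[1] == 0x0224 */
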
355static int p4_compute_mmcr(unsigned int event[], int n_ev,
356 unsigned int hwc[], u64 mmcr[])
357{
358 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
359 unsigned int pmc, unit, byte, psel, lower;
360 unsigned int ttm, grp;
361 unsigned int pmc_inuse = 0;
362 unsigned int pmc_grp_use[2];
363 unsigned char busbyte[4];
364 unsigned char unituse[16];
365 unsigned int unitlower = 0;
366 int i;
367
368 if (n_ev > 8)
369 return -1;
370
371 /* First pass to count resource use */
372 pmc_grp_use[0] = pmc_grp_use[1] = 0;
373 memset(busbyte, 0, sizeof(busbyte));
374 memset(unituse, 0, sizeof(unituse));
375 for (i = 0; i < n_ev; ++i) {
376 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
377 if (pmc) {
378 if (pmc_inuse & (1 << (pmc - 1)))
379 return -1;
380 pmc_inuse |= 1 << (pmc - 1);
381 /* count 1/2/5/6 vs 3/4/7/8 use */
382 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
383 }
384 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
385 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
386 lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
387 if (unit) {
388 if (!pmc)
389 ++pmc_grp_use[byte & 1];
390 if (unit == 6 || unit == 8)
391 /* map alt ISU1/IFU codes: 6->2, 8->3 */
392 unit = (unit >> 1) - 1;
393 if (busbyte[byte] && busbyte[byte] != unit)
394 return -1;
395 busbyte[byte] = unit;
396 lower <<= unit;
397 if (unituse[unit] && lower != (unitlower & lower))
398 return -1;
399 unituse[unit] = 1;
400 unitlower |= lower;
401 }
402 }
403 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
404 return -1;
405
406 /*
407 * Assign resources and set multiplexer selects.
408 *
409 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
410 * Each TTMx can only select one unit, but since
411 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
412 * we have some choices.
413 */
414 if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
415 unituse[6] = 1; /* Move 2 to 6 */
416 unituse[2] = 0;
417 }
418 if (unituse[3] & (unituse[1] | unituse[2])) {
419 unituse[8] = 1; /* Move 3 to 8 */
420 unituse[3] = 0;
421 unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
422 }
423 /* Check only one unit per TTMx */
424 if (unituse[1] + unituse[2] + unituse[3] > 1 ||
425 unituse[4] + unituse[6] + unituse[7] > 1 ||
426 unituse[8] + unituse[9] > 1 ||
427 (unituse[5] | unituse[10] | unituse[11] |
428 unituse[13] | unituse[14]))
429 return -1;
430
431 /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
432 mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
433 mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
434 mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
435
436 /* Set TTCxSEL fields. */
437 if (unitlower & 0xe)
438 mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
439 if (unitlower & 0xf0)
440 mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
441 if (unitlower & 0xf00)
442 mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
443 if (unitlower & 0x7000)
444 mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
445
446 /* Set byte lane select fields. */
447 for (byte = 0; byte < 4; ++byte) {
448 unit = busbyte[byte];
449 if (!unit)
450 continue;
451 if (unit == 0xf) {
452 /* special case for GPS */
453 mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
454 } else {
455 if (!unituse[unit])
456 ttm = unit - 1; /* 2->1, 3->2 */
457 else
458 ttm = unit >> 2;
459 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
460 }
461 }
462
463 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
464 for (i = 0; i < n_ev; ++i) {
465 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
466 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
467 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
468 psel = event[i] & PM_PMCSEL_MSK;
469 if (!pmc) {
470 /* Bus event or 00xxx direct event (off or cycles) */
471 if (unit)
472 psel |= 0x10 | ((byte & 2) << 2);
473 for (pmc = 0; pmc < 8; ++pmc) {
474 if (pmc_inuse & (1 << pmc))
475 continue;
476 grp = (pmc >> 1) & 1;
477 if (unit) {
478 if (grp == (byte & 1))
479 break;
480 } else if (pmc_grp_use[grp] < 4) {
481 ++pmc_grp_use[grp];
482 break;
483 }
484 }
485 pmc_inuse |= 1 << pmc;
486 } else {
487 /* Direct event */
488 --pmc;
489 if (psel == 0 && (byte & 2))
490 /* add events on higher-numbered bus */
491 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
492 else if (psel == 6 && byte == 3)
493 /* seem to need to set sample_enable here */
494 mmcra |= MMCRA_SAMPLE_ENABLE;
495 psel |= 8;
496 }
497 if (pmc <= 1)
498 mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
499 else
500 mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
501 if (pmc == 7) /* PMC8 */
502 mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
503 hwc[i] = pmc;
504 if (p4_marked_instr_event(event[i]))
505 mmcra |= MMCRA_SAMPLE_ENABLE;
506 }
507
508 if (pmc_inuse & 1)
509 mmcr0 |= MMCR0_PMC1CE;
510 if (pmc_inuse & 0xfe)
511 mmcr0 |= MMCR0_PMCjCE;
512
513 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
514
515 /* Return MMCRx values */
516 mmcr[0] = mmcr0;
517 mmcr[1] = mmcr1;
518 mmcr[2] = mmcra;
519 return 0;
520}
521
522static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
523{
524 /*
525 * Setting the PMCxSEL field to 0 disables PMC x.
526 * (Note that pmc is 0-based here, not 1-based.)
527 */
528 if (pmc <= 1) {
529 mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
530 } else {
531 mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
532 if (pmc == 7)
533 mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
534 }
535}
536
537static int p4_generic_events[] = {
538 [PERF_COUNT_CPU_CYCLES] = 7,
539 [PERF_COUNT_INSTRUCTIONS] = 0x1001,
540 [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
541 [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
542 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
543 [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
544};
545
546struct power_pmu power4_pmu = {
547 .n_counter = 8,
548 .max_alternatives = 5,
549 .add_fields = 0x0000001100005555ull,
550 .test_adder = 0x0011083300000000ull,
551 .compute_mmcr = p4_compute_mmcr,
552 .get_constraint = p4_get_constraint,
553 .get_alternatives = p4_get_alternatives,
554 .disable_pmc = p4_disable_pmc,
555 .n_generic = ARRAY_SIZE(p4_generic_events),
556 .generic_events = p4_generic_events,
557};
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 000000000000..cec21ea65b0e
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,452 @@
1/*
2 * Performance counter support for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5+
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * [ ><><>< ><> <><>[ > < >< >< >< ><><><><>
82 * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1
83 *
84 * NC - number of counters
85 * 51: NC error 0x0008_0000_0000_0000
86 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
87 *
88 * G0..G3 - GRS mux constraints
89 * 46-47: GRS_L2SEL value
90 * 44-45: GRS_L3SEL value
91 * 41-44: GRS_MCSEL value
92 * 39-40: GRS_FABSEL value
93 * Note that these match up with their bit positions in MMCR1
94 *
95 * T0 - TTM0 constraint
96 * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
97 *
98 * T1 - TTM1 constraint
99 * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 33: UC3 error 0x02_0000_0000
103 * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
104 * 31: ISU0 events needed 0x00_8000_0000
105 * 30: IDU|GRS events needed 0x00_4000_0000
106 *
107 * B0
108 * 20-23: Byte 0 event source 0x00f0_0000
109 * Encoding as for the event code
110 *
111 * B1, B2, B3
112 * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
113 *
114 * P4
115 * 7: P4 error 0x80
116 * 6-7: Count of events needing PMC4
117 *
118 * P1..P3
119 * 0-6: Count of events needing PMC1..PMC3
120 */
121
122static const int grsel_shift[8] = {
123 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
124 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
125 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
126};
127
128/* Masks and values for using events from the various units */
129static u64 unit_cons[PM_LASTUNIT+1][2] = {
130 [PM_FPU] = { 0x3200000000ull, 0x0100000000ull },
131 [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull },
132 [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull },
133 [PM_IFU] = { 0x3200000000ull, 0x2100000000ull },
134 [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull },
135 [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull },
136};
137
138static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
139{
140 int pmc, byte, unit, sh;
141 int bit, fmask;
142 u64 mask = 0, value = 0;
143
144 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
145 if (pmc) {
146 if (pmc > 4)
147 return -1;
148 sh = (pmc - 1) * 2;
149 mask |= 2 << sh;
150 value |= 1 << sh;
151 }
152 if (event & PM_BUSEVENT_MSK) {
153 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
154 if (unit > PM_LASTUNIT)
155 return -1;
156 if (unit == PM_ISU0_ALT)
157 unit = PM_ISU0;
158 mask |= unit_cons[unit][0];
159 value |= unit_cons[unit][1];
160 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
161 if (byte >= 4) {
162 if (unit != PM_LSU1)
163 return -1;
164 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
165 ++unit;
166 byte &= 3;
167 }
168 if (unit == PM_GRS) {
169 bit = event & 7;
170 fmask = (bit == 6)? 7: 3;
171 sh = grsel_shift[bit];
172 mask |= (u64)fmask << sh;
173 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
174 }
175 /* Set byte lane select field */
176 mask |= 0xfULL << (20 - 4 * byte);
177 value |= (u64)unit << (20 - 4 * byte);
178 }
179 mask |= 0x8000000000000ull;
180 value |= 0x1000000000000ull;
181 *maskp = mask;
182 *valp = value;
183 return 0;
184}
185
186#define MAX_ALT 3 /* at most 3 alternatives for any event */
187
188static const unsigned int event_alternatives[][MAX_ALT] = {
189 { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */
190 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
191 { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */
192 { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */
193 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
194 { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */
195 { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */
196 { 0x100009, 0x200009 }, /* PM_INST_CMPL */
197 { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
198 { 0x300009, 0x400009 }, /* PM_INST_DISP */
199};
200
201/*
202 * Scan the alternatives table for a match and return the
203 * index into the alternatives table if found, else -1.
204 */
205static int find_alternative(unsigned int event)
206{
207 int i, j;
208
209 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
210 if (event < event_alternatives[i][0])
211 break;
212 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
213 if (event == event_alternatives[i][j])
214 return i;
215 }
216 return -1;
217}
218
219static const unsigned char bytedecode_alternatives[4][4] = {
220 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
221 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
222 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
223 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
224};
225
226/*
227 * Some direct events for decodes of event bus byte 3 have alternative
228 * PMCSEL values on other counters. This returns the alternative
229 * event code for those that do, or -1 otherwise. This also handles
230 * alternative PCMSEL values for add events.
231 */
232static int find_alternative_bdecode(unsigned int event)
233{
234 int pmc, altpmc, pp, j;
235
236 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
237 if (pmc == 0 || pmc > 4)
238 return -1;
239 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
240 pp = event & PM_PMCSEL_MSK;
241 for (j = 0; j < 4; ++j) {
242 if (bytedecode_alternatives[pmc - 1][j] == pp) {
243 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
244 (altpmc << PM_PMC_SH) |
245 bytedecode_alternatives[altpmc - 1][j];
246 }
247 }
248
249 /* new decode alternatives for power5+ */
250 if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
251 return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
252 if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
253 return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
254
255 /* alternative add event encodings */
256 if (pp == 0x10 || pp == 0x28)
257 return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
258 (altpmc << PM_PMC_SH);
259
260 return -1;
261}
262
263static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
264{
265 int i, j, ae, nalt = 1;
266
267 alt[0] = event;
268 nalt = 1;
269 i = find_alternative(event);
270 if (i >= 0) {
271 for (j = 0; j < MAX_ALT; ++j) {
272 ae = event_alternatives[i][j];
273 if (ae && ae != event)
274 alt[nalt++] = ae;
275 }
276 } else {
277 ae = find_alternative_bdecode(event);
278 if (ae > 0)
279 alt[nalt++] = ae;
280 }
281 return nalt;
282}
283
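
For example, PM_INST_CMPL appears in event_alternatives with two codes, so a call like the following would return both (illustrative call, not from the patch):

	unsigned int alt[MAX_ALT];
	int n = power5p_get_alternatives(0x100009, alt);	/* PM_INST_CMPL */
	/* expected: n == 2, alt[0] == 0x100009, alt[1] == 0x200009 */
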
284static int power5p_compute_mmcr(unsigned int event[], int n_ev,
285 unsigned int hwc[], u64 mmcr[])
286{
287 u64 mmcr1 = 0;
288 unsigned int pmc, unit, byte, psel;
289 unsigned int ttm;
290 int i, isbus, bit, grsel;
291 unsigned int pmc_inuse = 0;
292 unsigned char busbyte[4];
293 unsigned char unituse[16];
294 int ttmuse;
295
296 if (n_ev > 4)
297 return -1;
298
299 /* First pass to count resource use */
300 memset(busbyte, 0, sizeof(busbyte));
301 memset(unituse, 0, sizeof(unituse));
302 for (i = 0; i < n_ev; ++i) {
303 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
304 if (pmc) {
305 if (pmc > 4)
306 return -1;
307 if (pmc_inuse & (1 << (pmc - 1)))
308 return -1;
309 pmc_inuse |= 1 << (pmc - 1);
310 }
311 if (event[i] & PM_BUSEVENT_MSK) {
312 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
313 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
314 if (unit > PM_LASTUNIT)
315 return -1;
316 if (unit == PM_ISU0_ALT)
317 unit = PM_ISU0;
318 if (byte >= 4) {
319 if (unit != PM_LSU1)
320 return -1;
321 ++unit;
322 byte &= 3;
323 }
324 if (busbyte[byte] && busbyte[byte] != unit)
325 return -1;
326 busbyte[byte] = unit;
327 unituse[unit] = 1;
328 }
329 }
330
331 /*
332 * Assign resources and set multiplexer selects.
333 *
334 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
335 * choice we have to deal with.
336 */
337 if (unituse[PM_ISU0] &
338 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
339 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
340 unituse[PM_ISU0] = 0;
341 }
342 /* Set TTM[01]SEL fields. */
343 ttmuse = 0;
344 for (i = PM_FPU; i <= PM_ISU1; ++i) {
345 if (!unituse[i])
346 continue;
347 if (ttmuse++)
348 return -1;
349 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
350 }
351 ttmuse = 0;
352 for (; i <= PM_GRS; ++i) {
353 if (!unituse[i])
354 continue;
355 if (ttmuse++)
356 return -1;
357 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
358 }
359 if (ttmuse > 1)
360 return -1;
361
362 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
363 for (byte = 0; byte < 4; ++byte) {
364 unit = busbyte[byte];
365 if (!unit)
366 continue;
367 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
368 /* get ISU0 through TTM1 rather than TTM0 */
369 unit = PM_ISU0_ALT;
370 } else if (unit == PM_LSU1 + 1) {
371 /* select lower word of LSU1 for this byte */
372 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
373 }
374 ttm = unit >> 2;
375 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
376 }
377
378 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
379 for (i = 0; i < n_ev; ++i) {
380 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
381 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
382 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
383 psel = event[i] & PM_PMCSEL_MSK;
384 isbus = event[i] & PM_BUSEVENT_MSK;
385 if (!pmc) {
386 /* Bus event or any-PMC direct event */
387 for (pmc = 0; pmc < 4; ++pmc) {
388 if (!(pmc_inuse & (1 << pmc)))
389 break;
390 }
391 if (pmc >= 4)
392 return -1;
393 pmc_inuse |= 1 << pmc;
394 } else {
395 /* Direct event */
396 --pmc;
397 if (isbus && (byte & 2) &&
398 (psel == 8 || psel == 0x10 || psel == 0x28))
399 /* add events on higher-numbered bus */
400 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
401 }
402 if (isbus && unit == PM_GRS) {
403 bit = psel & 7;
404 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
405 mmcr1 |= (u64)grsel << grsel_shift[bit];
406 }
407 if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
408 /* select alternate byte lane */
409 psel |= 0x10;
410 if (pmc <= 3)
411 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
412 hwc[i] = pmc;
413 }
414
415 /* Return MMCRx values */
416 mmcr[0] = 0;
417 if (pmc_inuse & 1)
418 mmcr[0] = MMCR0_PMC1CE;
419 if (pmc_inuse & 0x3e)
420 mmcr[0] |= MMCR0_PMCjCE;
421 mmcr[1] = mmcr1;
422 mmcr[2] = 0;
423 return 0;
424}
425
426static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
427{
428 if (pmc <= 3)
429 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
430}
431
432static int power5p_generic_events[] = {
433 [PERF_COUNT_CPU_CYCLES] = 0xf,
434 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
435 [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
436 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
437 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
438 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
439};
440
441struct power_pmu power5p_pmu = {
442 .n_counter = 4,
443 .max_alternatives = MAX_ALT,
444 .add_fields = 0x7000000000055ull,
445 .test_adder = 0x3000040000000ull,
446 .compute_mmcr = power5p_compute_mmcr,
447 .get_constraint = power5p_get_constraint,
448 .get_alternatives = power5p_get_alternatives,
449 .disable_pmc = power5p_disable_pmc,
450 .n_generic = ARRAY_SIZE(power5p_generic_events),
451 .generic_events = power5p_generic_events,
452};
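The .add_fields and .test_adder values above complement the (mask, value) pairs built by power5p_get_constraint(): select-type fields (TTM, byte-lane and GRS mux selects) must agree exactly across all scheduled events, while the small count fields accumulate and overflow into an adjacent error bit that the mask covers (see the constraint-bit layout comment in the POWER5 file below for the field positions). The checker that combines them lives in arch/powerpc/kernel/perf_counter.c, which is in the diffstat but not shown in this excerpt. The following is a minimal sketch of the select-field half of that test only, not the kernel's code:

/*
 * Sketch only: two events are compatible in their select-type fields if
 * they agree on every bit that both of their masks cover.  The count-type
 * fields are handled in the kernel by the add_fields/test_adder
 * arithmetic and are not reproduced here.
 */
static int select_fields_compatible(u64 mask_a, u64 val_a,
				    u64 mask_b, u64 val_b)
{
	u64 both = mask_a & mask_b;	/* fields constrained by both events */

	return ((val_a ^ val_b) & both) == 0;
}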
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 000000000000..379ed1087cca
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,475 @@
1/*
2 * Performance counter support for POWER5 (not POWER5++) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5 (not POWER5++)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
82 * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * T0 - TTM0 constraint
85 * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
86 *
87 * T1 - TTM1 constraint
88 * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
89 *
90 * NC - number of counters
91 * 51: NC error 0x0008_0000_0000_0000
92 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
93 *
94 * G0..G3 - GRS mux constraints
95 * 46-47: GRS_L2SEL value
96 * 44-45: GRS_L3SEL value
97 * 41-43: GRS_MCSEL value
98 * 39-40: GRS_FABSEL value
99 * Note that these match up with their bit positions in MMCR1
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 37: UC3 error 0x20_0000_0000
103 * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
104 * 35: ISU0 events needed 0x08_0000_0000
105 * 34: IDU|GRS events needed 0x04_0000_0000
106 *
107 * PS1
108 * 33: PS1 error 0x2_0000_0000
109 * 31-32: count of events needing PMC1/2 0x1_8000_0000
110 *
111 * PS2
112 * 30: PS2 error 0x4000_0000
113 * 28-29: count of events needing PMC3/4 0x3000_0000
114 *
115 * B0
116 * 24-27: Byte 0 event source 0x0f00_0000
117 * Encoding as for the event code
118 *
119 * B1, B2, B3
120 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
121 *
122 * P1..P6
123 * 0-11: Count of events needing PMC1..PMC6
124 */
125
126static const int grsel_shift[8] = {
127 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
128 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
129 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
130};
131
132/* Masks and values for using events from the various units */
133static u64 unit_cons[PM_LASTUNIT+1][2] = {
134 [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull },
135 [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull },
136 [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull },
137 [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull },
138 [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull },
139 [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull },
140};
141
142static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
143{
144 int pmc, byte, unit, sh;
145 int bit, fmask;
146 u64 mask = 0, value = 0;
147 int grp = -1;
148
149 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
150 if (pmc) {
151 if (pmc > 6)
152 return -1;
153 sh = (pmc - 1) * 2;
154 mask |= 2 << sh;
155 value |= 1 << sh;
156 if (pmc <= 4)
157 grp = (pmc - 1) >> 1;
158 else if (event != 0x500009 && event != 0x600005)
159 return -1;
160 }
161 if (event & PM_BUSEVENT_MSK) {
162 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
163 if (unit > PM_LASTUNIT)
164 return -1;
165 if (unit == PM_ISU0_ALT)
166 unit = PM_ISU0;
167 mask |= unit_cons[unit][0];
168 value |= unit_cons[unit][1];
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 if (byte >= 4) {
171 if (unit != PM_LSU1)
172 return -1;
173 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
174 ++unit;
175 byte &= 3;
176 }
177 if (unit == PM_GRS) {
178 bit = event & 7;
179 fmask = (bit == 6)? 7: 3;
180 sh = grsel_shift[bit];
181 mask |= (u64)fmask << sh;
182 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
183 }
184 /*
185 * Bus events on bytes 0 and 2 can be counted
186 * on PMC1/2; bytes 1 and 3 on PMC3/4.
187 */
188 if (!pmc)
189 grp = byte & 1;
190 /* Set byte lane select field */
191 mask |= 0xfULL << (24 - 4 * byte);
192 value |= (u64)unit << (24 - 4 * byte);
193 }
194 if (grp == 0) {
195 /* increment PMC1/2 field */
196 mask |= 0x200000000ull;
197 value |= 0x080000000ull;
198 } else if (grp == 1) {
199 /* increment PMC3/4 field */
200 mask |= 0x40000000ull;
201 value |= 0x10000000ull;
202 }
203 if (pmc < 5) {
204 /* need a counter from PMC1-4 set */
205 mask |= 0x8000000000000ull;
206 value |= 0x1000000000000ull;
207 }
208 *maskp = mask;
209 *valp = value;
210 return 0;
211}
212
213#define MAX_ALT 3 /* at most 3 alternatives for any event */
214
215static const unsigned int event_alternatives[][MAX_ALT] = {
216 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
217 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
218 { 0x100005, 0x600005 }, /* PM_RUN_CYC */
219 { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */
220 { 0x300009, 0x400009 }, /* PM_INST_DISP */
221};
222
223/*
224 * Scan the alternatives table for a match and return the
225 * index into the alternatives table if found, else -1.
226 */
227static int find_alternative(unsigned int event)
228{
229 int i, j;
230
231 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
232 if (event < event_alternatives[i][0])
233 break;
234 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
235 if (event == event_alternatives[i][j])
236 return i;
237 }
238 return -1;
239}
240
241static const unsigned char bytedecode_alternatives[4][4] = {
242 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
243 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
244 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
245 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
246};
247
248/*
249 * Some direct events for decodes of event bus byte 3 have alternative
250 * PMCSEL values on other counters. This returns the alternative
251 * event code for those that do, or -1 otherwise.
252 */
253static int find_alternative_bdecode(unsigned int event)
254{
255 int pmc, altpmc, pp, j;
256
257 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
258 if (pmc == 0 || pmc > 4)
259 return -1;
260 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
261 pp = event & PM_PMCSEL_MSK;
262 for (j = 0; j < 4; ++j) {
263 if (bytedecode_alternatives[pmc - 1][j] == pp) {
264 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
265 (altpmc << PM_PMC_SH) |
266 bytedecode_alternatives[altpmc - 1][j];
267 }
268 }
269 return -1;
270}
271
272static int power5_get_alternatives(unsigned int event, unsigned int alt[])
273{
274 int i, j, ae, nalt = 1;
275
276 alt[0] = event;
277 nalt = 1;
278 i = find_alternative(event);
279 if (i >= 0) {
280 for (j = 0; j < MAX_ALT; ++j) {
281 ae = event_alternatives[i][j];
282 if (ae && ae != event)
283 alt[nalt++] = ae;
284 }
285 } else {
286 ae = find_alternative_bdecode(event);
287 if (ae > 0)
288 alt[nalt++] = ae;
289 }
290 return nalt;
291}
292
293static int power5_compute_mmcr(unsigned int event[], int n_ev,
294 unsigned int hwc[], u64 mmcr[])
295{
296 u64 mmcr1 = 0;
297 unsigned int pmc, unit, byte, psel;
298 unsigned int ttm, grp;
299 int i, isbus, bit, grsel;
300 unsigned int pmc_inuse = 0;
301 unsigned int pmc_grp_use[2];
302 unsigned char busbyte[4];
303 unsigned char unituse[16];
304 int ttmuse;
305
306 if (n_ev > 6)
307 return -1;
308
309 /* First pass to count resource use */
310 pmc_grp_use[0] = pmc_grp_use[1] = 0;
311 memset(busbyte, 0, sizeof(busbyte));
312 memset(unituse, 0, sizeof(unituse));
313 for (i = 0; i < n_ev; ++i) {
314 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
315 if (pmc) {
316 if (pmc > 6)
317 return -1;
318 if (pmc_inuse & (1 << (pmc - 1)))
319 return -1;
320 pmc_inuse |= 1 << (pmc - 1);
321 /* count 1/2 vs 3/4 use */
322 if (pmc <= 4)
323 ++pmc_grp_use[(pmc - 1) >> 1];
324 }
325 if (event[i] & PM_BUSEVENT_MSK) {
326 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
327 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
328 if (unit > PM_LASTUNIT)
329 return -1;
330 if (unit == PM_ISU0_ALT)
331 unit = PM_ISU0;
332 if (byte >= 4) {
333 if (unit != PM_LSU1)
334 return -1;
335 ++unit;
336 byte &= 3;
337 }
338 if (!pmc)
339 ++pmc_grp_use[byte & 1];
340 if (busbyte[byte] && busbyte[byte] != unit)
341 return -1;
342 busbyte[byte] = unit;
343 unituse[unit] = 1;
344 }
345 }
346 if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
347 return -1;
348
349 /*
350 * Assign resources and set multiplexer selects.
351 *
352 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
353 * choice we have to deal with.
354 */
355 if (unituse[PM_ISU0] &
356 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
357 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
358 unituse[PM_ISU0] = 0;
359 }
360 /* Set TTM[01]SEL fields. */
361 ttmuse = 0;
362 for (i = PM_FPU; i <= PM_ISU1; ++i) {
363 if (!unituse[i])
364 continue;
365 if (ttmuse++)
366 return -1;
367 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
368 }
369 ttmuse = 0;
370 for (; i <= PM_GRS; ++i) {
371 if (!unituse[i])
372 continue;
373 if (ttmuse++)
374 return -1;
375 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
376 }
377 if (ttmuse > 1)
378 return -1;
379
380 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
381 for (byte = 0; byte < 4; ++byte) {
382 unit = busbyte[byte];
383 if (!unit)
384 continue;
385 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
386 /* get ISU0 through TTM1 rather than TTM0 */
387 unit = PM_ISU0_ALT;
388 } else if (unit == PM_LSU1 + 1) {
389 /* select lower word of LSU1 for this byte */
390 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
391 }
392 ttm = unit >> 2;
393 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
394 }
395
396 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
397 for (i = 0; i < n_ev; ++i) {
398 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
399 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
400 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
401 psel = event[i] & PM_PMCSEL_MSK;
402 isbus = event[i] & PM_BUSEVENT_MSK;
403 if (!pmc) {
404 /* Bus event or any-PMC direct event */
405 for (pmc = 0; pmc < 4; ++pmc) {
406 if (pmc_inuse & (1 << pmc))
407 continue;
408 grp = (pmc >> 1) & 1;
409 if (isbus) {
410 if (grp == (byte & 1))
411 break;
412 } else if (pmc_grp_use[grp] < 2) {
413 ++pmc_grp_use[grp];
414 break;
415 }
416 }
417 pmc_inuse |= 1 << pmc;
418 } else if (pmc <= 4) {
419 /* Direct event */
420 --pmc;
421 if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
422 /* add events on higher-numbered bus */
423 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
424 } else {
425 /* Instructions or run cycles on PMC5/6 */
426 --pmc;
427 }
428 if (isbus && unit == PM_GRS) {
429 bit = psel & 7;
430 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
431 mmcr1 |= (u64)grsel << grsel_shift[bit];
432 }
433 if (pmc <= 3)
434 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
435 hwc[i] = pmc;
436 }
437
438 /* Return MMCRx values */
439 mmcr[0] = 0;
440 if (pmc_inuse & 1)
441 mmcr[0] = MMCR0_PMC1CE;
442 if (pmc_inuse & 0x3e)
443 mmcr[0] |= MMCR0_PMCjCE;
444 mmcr[1] = mmcr1;
445 mmcr[2] = 0;
446 return 0;
447}
448
449static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
450{
451 if (pmc <= 3)
452 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
453}
454
455static int power5_generic_events[] = {
456 [PERF_COUNT_CPU_CYCLES] = 0xf,
457 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
458 [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
459 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
460 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
461 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
462};
463
464struct power_pmu power5_pmu = {
465 .n_counter = 6,
466 .max_alternatives = MAX_ALT,
467 .add_fields = 0x7000090000555ull,
468 .test_adder = 0x3000490000000ull,
469 .compute_mmcr = power5_compute_mmcr,
470 .get_constraint = power5_get_constraint,
471 .get_alternatives = power5_get_alternatives,
472 .disable_pmc = power5_disable_pmc,
473 .n_generic = ARRAY_SIZE(power5_generic_events),
474 .generic_events = power5_generic_events,
475};
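The raw codes in power5_generic_events[] pack several fields that the PM_* shift/mask macros at the top of this file pull apart. A small hypothetical helper (the struct and names below are for illustration only) makes the layout explicit; for example, 0x4c1090 (LD_REF_L1) decodes to PMC4, unit 0xc (PM_LSU1), byte 1, PMCSEL 0x10, with the bus-event bit set.

/* Hypothetical decode helper using the PM_* macros defined above. */
struct p5_event_fields {
	unsigned int pmc;	/* 1-based PMC number, 0 = any PMC */
	unsigned int unit;	/* TTM mux select (PM_FPU .. PM_LSU1) */
	unsigned int byte;	/* byte of the event bus */
	unsigned int grs;	/* storage-subsystem mux select */
	unsigned int psel;	/* PMCxSEL value */
	int bus_event;		/* event is delivered over the event bus */
};

static void p5_decode_event(unsigned int event, struct p5_event_fields *f)
{
	f->pmc       = (event >> PM_PMC_SH) & PM_PMC_MSK;
	f->unit      = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
	f->byte      = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
	f->grs       = (event >> PM_GRS_SH) & PM_GRS_MSK;
	f->psel      = event & PM_PMCSEL_MSK;
	f->bus_event = (event & PM_BUSEVENT_MSK) != 0;
}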
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..b1f61f3c97bb
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,283 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER6
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0x7
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* Unit the event comes from (TTMxSEL encoding) */
22#define PM_UNIT_MSK 0xf
23#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
24#define PM_LLAV 0x8000 /* Load lookahead match value */
25#define PM_LLA 0x4000 /* Load lookahead match enable */
26#define PM_BYTE_SH 12 /* Byte of event bus to use */
27#define PM_BYTE_MSK 3
28#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
29#define PM_SUBUNIT_MSK 7
30#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
31#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
32#define PM_BUSEVENT_MSK 0xf3700
33
34/*
35 * Bits in MMCR1 for POWER6
36 */
37#define MMCR1_TTM0SEL_SH 60
38#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
39#define MMCR1_TTMSEL_MSK 0xf
40#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
41#define MMCR1_NESTSEL_SH 45
42#define MMCR1_NESTSEL_MSK 0x7
43#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
44#define MMCR1_PMC1_LLA ((u64)1 << 44)
45#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
46#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
47#define MMCR1_PMC1SEL_SH 24
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Assign PMC numbers and compute MMCR1 value for a set of events
53 */
54static int p6_compute_mmcr(unsigned int event[], int n_ev,
55 unsigned int hwc[], u64 mmcr[])
56{
57 u64 mmcr1 = 0;
58 int i;
59 unsigned int pmc, ev, b, u, s, psel;
60 unsigned int ttmset = 0;
61 unsigned int pmc_inuse = 0;
62
63 if (n_ev > 4)
64 return -1;
65 for (i = 0; i < n_ev; ++i) {
66 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
67 if (pmc) {
68 if (pmc_inuse & (1 << (pmc - 1)))
69 return -1; /* collision! */
70 pmc_inuse |= 1 << (pmc - 1);
71 }
72 }
73 for (i = 0; i < n_ev; ++i) {
74 ev = event[i];
75 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
76 if (pmc) {
77 --pmc;
78 } else {
79 /* can go on any PMC; find a free one */
80 for (pmc = 0; pmc < 4; ++pmc)
81 if (!(pmc_inuse & (1 << pmc)))
82 break;
83 pmc_inuse |= 1 << pmc;
84 }
85 hwc[i] = pmc;
86 psel = ev & PM_PMCSEL_MSK;
87 if (ev & PM_BUSEVENT_MSK) {
88 /* this event uses the event bus */
89 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
90 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
91 /* check for conflict on this byte of event bus */
92 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
93 return -1;
94 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
95 ttmset |= 1 << b;
96 if (u == 5) {
97 /* Nest events have a further mux */
98 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
99 if ((ttmset & 0x10) &&
100 MMCR1_NESTSEL(mmcr1) != s)
101 return -1;
102 ttmset |= 0x10;
103 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
104 }
105 if (0x30 <= psel && psel <= 0x3d) {
106 /* these need the PMCx_ADDR_SEL bits */
107 if (b >= 2)
108 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
109 }
110 /* bus select values are different for PMC3/4 */
111 if (pmc >= 2 && (psel & 0x90) == 0x80)
112 psel ^= 0x20;
113 }
114 if (ev & PM_LLA) {
115 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
116 if (ev & PM_LLAV)
117 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
118 }
119 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
120 }
121 mmcr[0] = 0;
122 if (pmc_inuse & 1)
123 mmcr[0] = MMCR0_PMC1CE;
124 if (pmc_inuse & 0xe)
125 mmcr[0] |= MMCR0_PMCjCE;
126 mmcr[1] = mmcr1;
127 mmcr[2] = 0;
128 return 0;
129}
130
131/*
132 * Layout of constraint bits:
133 *
134 * 0-1 add field: number of uses of PMC1 (max 1)
135 * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4
136 * 8-10 select field: nest (subunit) event selector
137 * 16-19 select field: unit on byte 0 of event bus
138 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
139 */
140static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
141{
142 int pmc, byte, sh;
143 unsigned int mask = 0, value = 0;
144
145 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
146 if (pmc) {
147 if (pmc > 4)
148 return -1;
149 sh = (pmc - 1) * 2;
150 mask |= 2 << sh;
151 value |= 1 << sh;
152 }
153 if (event & PM_BUSEVENT_MSK) {
154 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
155 sh = byte * 4;
156 mask |= PM_UNIT_MSKS << sh;
157 value |= (event & PM_UNIT_MSKS) << sh;
158 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
159 mask |= PM_SUBUNIT_MSKS;
160 value |= event & PM_SUBUNIT_MSKS;
161 }
162 }
163 *maskp = mask;
164 *valp = value;
165 return 0;
166}
167
168#define MAX_ALT 4 /* at most 4 alternatives for any event */
169
170static const unsigned int event_alternatives[][MAX_ALT] = {
171 { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
172 { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
173 { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
174 { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */
175 { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
176 { 0x10000e, 0x400010 }, /* PM_PURR */
177 { 0x100010, 0x4000f8 }, /* PM_FLUSH */
178 { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
179 { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
180 { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
181 { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
182 { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
183 { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
184 { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
185 { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
186 { 0x200012, 0x300012 }, /* PM_INST_DISP */
187 { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
188 { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
189 { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
190 { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
191 { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
192 { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
193 { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
194};
195
196/*
197 * This could be made more efficient with a binary search on
198 * a presorted list, if necessary
199 */
200static int find_alternatives_list(unsigned int event)
201{
202 int i, j;
203 unsigned int alt;
204
205 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
206 if (event < event_alternatives[i][0])
207 return -1;
208 for (j = 0; j < MAX_ALT; ++j) {
209 alt = event_alternatives[i][j];
210 if (!alt || event < alt)
211 break;
212 if (event == alt)
213 return i;
214 }
215 }
216 return -1;
217}
218
219static int p6_get_alternatives(unsigned int event, unsigned int alt[])
220{
221 int i, j;
222 unsigned int aevent, psel, pmc;
223 unsigned int nalt = 1;
224
225 alt[0] = event;
226
227 /* check the alternatives table */
228 i = find_alternatives_list(event);
229 if (i >= 0) {
230 /* copy out alternatives from list */
231 for (j = 0; j < MAX_ALT; ++j) {
232 aevent = event_alternatives[i][j];
233 if (!aevent)
234 break;
235 if (aevent != event)
236 alt[nalt++] = aevent;
237 }
238
239 } else {
240 /* Check for alternative ways of computing sum events */
241 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
242 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
243 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
244 if (pmc && (psel == 0x32 || psel == 0x34))
245 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
246 ((5 - pmc) << PM_PMC_SH);
247
248 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
249 if (pmc && (psel == 0x38 || psel == 0x3a))
250 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
251 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
252 }
253
254 return nalt;
255}
256
257static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
258{
259 /* Set PMCxSEL to 0 to disable PMCx */
260 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
261}
262
263static int power6_generic_events[] = {
264 [PERF_COUNT_CPU_CYCLES] = 0x1e,
265 [PERF_COUNT_INSTRUCTIONS] = 2,
266 [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
267 [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
268 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
269 [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
270};
271
272struct power_pmu power6_pmu = {
273 .n_counter = 4,
274 .max_alternatives = MAX_ALT,
275 .add_fields = 0x55,
276 .test_adder = 0,
277 .compute_mmcr = p6_compute_mmcr,
278 .get_constraint = p6_get_constraint,
279 .get_alternatives = p6_get_alternatives,
280 .disable_pmc = p6_disable_pmc,
281 .n_generic = ARRAY_SIZE(power6_generic_events),
282 .generic_events = power6_generic_events,
283};
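The comment above find_alternatives_list() notes that a binary search over a presorted list would be faster. The sketch below shows that variant, assuming (as the linear scan already does) that event_alternatives[] is sorted by its first column; a fallback pass is still needed because codes in the later columns are not globally sorted, which is presumably why the simple linear scan was kept.

/* Sketch of the binary-search variant suggested by the comment above. */
static int find_alternatives_list_bsearch(unsigned int event)
{
	int lo = 0, hi = ARRAY_SIZE(event_alternatives) - 1;
	int mid, row, j;

	while (lo <= hi) {
		mid = (lo + hi) / 2;
		if (event < event_alternatives[mid][0])
			hi = mid - 1;
		else if (event > event_alternatives[mid][0])
			lo = mid + 1;
		else
			return mid;
	}
	/* Not a first-column code; it may still appear as a later
	 * alternative in any row whose first code is smaller. */
	for (row = hi; row >= 0; --row)
		for (j = 1; j < MAX_ALT; ++j)
			if (event_alternatives[row][j] == event)
				return row;
	return -1;
}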
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..c3256580be1a
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,375 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_BYTE_SH 4 /* Byte number of event bus to use */
23#define PM_BYTE_MSK 3
24#define PM_PMCSEL_MSK 0xf
25
26/* Values in PM_UNIT field */
27#define PM_NONE 0
28#define PM_FPU 1
29#define PM_VPU 2
30#define PM_ISU 3
31#define PM_IFU 4
32#define PM_IDU 5
33#define PM_STS 6
34#define PM_LSU0 7
35#define PM_LSU1U 8
36#define PM_LSU1L 9
37#define PM_LASTUNIT 9
38
39/*
40 * Bits in MMCR0 for PPC970
41 */
42#define MMCR0_PMC1SEL_SH 8
43#define MMCR0_PMC2SEL_SH 1
44#define MMCR_PMCSEL_MSK 0x1f
45
46/*
47 * Bits in MMCR1 for PPC970
48 */
49#define MMCR1_TTM0SEL_SH 62
50#define MMCR1_TTM1SEL_SH 59
51#define MMCR1_TTM3SEL_SH 53
52#define MMCR1_TTMSEL_MSK 3
53#define MMCR1_TD_CP_DBG0SEL_SH 50
54#define MMCR1_TD_CP_DBG1SEL_SH 48
55#define MMCR1_TD_CP_DBG2SEL_SH 46
56#define MMCR1_TD_CP_DBG3SEL_SH 44
57#define MMCR1_PMC1_ADDER_SEL_SH 39
58#define MMCR1_PMC2_ADDER_SEL_SH 38
59#define MMCR1_PMC6_ADDER_SEL_SH 37
60#define MMCR1_PMC5_ADDER_SEL_SH 36
61#define MMCR1_PMC8_ADDER_SEL_SH 35
62#define MMCR1_PMC7_ADDER_SEL_SH 34
63#define MMCR1_PMC3_ADDER_SEL_SH 33
64#define MMCR1_PMC4_ADDER_SEL_SH 32
65#define MMCR1_PMC3SEL_SH 27
66#define MMCR1_PMC4SEL_SH 22
67#define MMCR1_PMC5SEL_SH 17
68#define MMCR1_PMC6SEL_SH 12
69#define MMCR1_PMC7SEL_SH 7
70#define MMCR1_PMC8SEL_SH 2
71
72static short mmcr1_adder_bits[8] = {
73 MMCR1_PMC1_ADDER_SEL_SH,
74 MMCR1_PMC2_ADDER_SEL_SH,
75 MMCR1_PMC3_ADDER_SEL_SH,
76 MMCR1_PMC4_ADDER_SEL_SH,
77 MMCR1_PMC5_ADDER_SEL_SH,
78 MMCR1_PMC6_ADDER_SEL_SH,
79 MMCR1_PMC7_ADDER_SEL_SH,
80 MMCR1_PMC8_ADDER_SEL_SH
81};
82
83/*
84 * Bits in MMCRA
85 */
86
87/*
88 * Layout of constraint bits:
89 * 6666555555555544444444443333333333222222222211111111110000000000
90 * 3210987654321098765432109876543210987654321098765432109876543210
91 * <><>[ >[ >[ >< >< >< >< ><><><><><><><><>
92 * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
93 *
94 * T0 - TTM0 constraint
95 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
96 *
97 * T1 - TTM1 constraint
98 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
99 *
100 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
101 * 43: UC3 error 0x0800_0000_0000
102 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
103 * 41: ISU events needed 0x0200_0000_0000
104 * 40: IDU|STS events needed 0x0100_0000_0000
105 *
106 * PS1
107 * 39: PS1 error 0x0080_0000_0000
108 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
109 *
110 * PS2
111 * 35: PS2 error 0x0008_0000_0000
112 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
113 *
114 * B0
115 * 28-31: Byte 0 event source 0xf000_0000
116 * Encoding as for the event code
117 *
118 * B1, B2, B3
119 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
120 *
121 * P1
122 * 15: P1 error 0x8000
123 * 14: Count of events needing PMC1
124 *
125 * P2..P8
126 * 0-13: Count of events needing PMC2..PMC8
127 */
128
129/* Masks and values for using events from the various units */
130static u64 unit_cons[PM_LASTUNIT+1][2] = {
131 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
132 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
133 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
134 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
135 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
136 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
137};
138
139static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
140{
141 int pmc, byte, unit, sh;
142 u64 mask = 0, value = 0;
143 int grp = -1;
144
145 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
146 if (pmc) {
147 if (pmc > 8)
148 return -1;
149 sh = (pmc - 1) * 2;
150 mask |= 2 << sh;
151 value |= 1 << sh;
152 grp = ((pmc - 1) >> 1) & 1;
153 }
154 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
155 if (unit) {
156 if (unit > PM_LASTUNIT)
157 return -1;
158 mask |= unit_cons[unit][0];
159 value |= unit_cons[unit][1];
160 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
161 /*
162 * Bus events on bytes 0 and 2 can be counted
163 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
164 */
165 if (!pmc)
166 grp = byte & 1;
167 /* Set byte lane select field */
168 mask |= 0xfULL << (28 - 4 * byte);
169 value |= (u64)unit << (28 - 4 * byte);
170 }
171 if (grp == 0) {
172 /* increment PMC1/2/5/6 field */
173 mask |= 0x8000000000ull;
174 value |= 0x1000000000ull;
175 } else if (grp == 1) {
176 /* increment PMC3/4/7/8 field */
177 mask |= 0x800000000ull;
178 value |= 0x100000000ull;
179 }
180 *maskp = mask;
181 *valp = value;
182 return 0;
183}
184
185static int p970_get_alternatives(unsigned int event, unsigned int alt[])
186{
187 alt[0] = event;
188
189 /* 2 alternatives for LSU empty */
190 if (event == 0x2002 || event == 0x3002) {
191 alt[1] = event ^ 0x1000;
192 return 2;
193 }
194
195 return 1;
196}
197
198static int p970_compute_mmcr(unsigned int event[], int n_ev,
199 unsigned int hwc[], u64 mmcr[])
200{
201 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
202 unsigned int pmc, unit, byte, psel;
203 unsigned int ttm, grp;
204 unsigned int pmc_inuse = 0;
205 unsigned int pmc_grp_use[2];
206 unsigned char busbyte[4];
207 unsigned char unituse[16];
208 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
209 unsigned char ttmuse[2];
210 unsigned char pmcsel[8];
211 int i;
212
213 if (n_ev > 8)
214 return -1;
215
216 /* First pass to count resource use */
217 pmc_grp_use[0] = pmc_grp_use[1] = 0;
218 memset(busbyte, 0, sizeof(busbyte));
219 memset(unituse, 0, sizeof(unituse));
220 for (i = 0; i < n_ev; ++i) {
221 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
222 if (pmc) {
223 if (pmc_inuse & (1 << (pmc - 1)))
224 return -1;
225 pmc_inuse |= 1 << (pmc - 1);
226 /* count 1/2/5/6 vs 3/4/7/8 use */
227 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
228 }
229 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
230 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
231 if (unit) {
232 if (unit > PM_LASTUNIT)
233 return -1;
234 if (!pmc)
235 ++pmc_grp_use[byte & 1];
236 if (busbyte[byte] && busbyte[byte] != unit)
237 return -1;
238 busbyte[byte] = unit;
239 unituse[unit] = 1;
240 }
241 }
242 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
243 return -1;
244
245 /*
246 * Assign resources and set multiplexer selects.
247 *
248 * PM_ISU can go either on TTM0 or TTM1, but that's the only
249 * choice we have to deal with.
250 */
251 if (unituse[PM_ISU] &
252 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
253 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
254 /* Set TTM[01]SEL fields. */
255 ttmuse[0] = ttmuse[1] = 0;
256 for (i = PM_FPU; i <= PM_STS; ++i) {
257 if (!unituse[i])
258 continue;
259 ttm = unitmap[i];
260 ++ttmuse[(ttm >> 2) & 1];
261 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
262 }
263 /* Check only one unit per TTMx */
264 if (ttmuse[0] > 1 || ttmuse[1] > 1)
265 return -1;
266
267 /* Set byte lane select fields and TTM3SEL. */
268 for (byte = 0; byte < 4; ++byte) {
269 unit = busbyte[byte];
270 if (!unit)
271 continue;
272 if (unit <= PM_STS)
273 ttm = (unitmap[unit] >> 2) & 1;
274 else if (unit == PM_LSU0)
275 ttm = 2;
276 else {
277 ttm = 3;
278 if (unit == PM_LSU1L && byte >= 2)
279 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
280 }
281 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
282 }
283
284 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
285 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
286 for (i = 0; i < n_ev; ++i) {
287 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
288 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
289 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
290 psel = event[i] & PM_PMCSEL_MSK;
291 if (!pmc) {
292 /* Bus event or any-PMC direct event */
293 if (unit)
294 psel |= 0x10 | ((byte & 2) << 2);
295 else
296 psel |= 8;
297 for (pmc = 0; pmc < 8; ++pmc) {
298 if (pmc_inuse & (1 << pmc))
299 continue;
300 grp = (pmc >> 1) & 1;
301 if (unit) {
302 if (grp == (byte & 1))
303 break;
304 } else if (pmc_grp_use[grp] < 4) {
305 ++pmc_grp_use[grp];
306 break;
307 }
308 }
309 pmc_inuse |= 1 << pmc;
310 } else {
311 /* Direct event */
312 --pmc;
313 if (psel == 0 && (byte & 2))
314 /* add events on higher-numbered bus */
315 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
316 }
317 pmcsel[pmc] = psel;
318 hwc[i] = pmc;
319 }
320 for (pmc = 0; pmc < 2; ++pmc)
321 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
322 for (; pmc < 8; ++pmc)
323 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
324 if (pmc_inuse & 1)
325 mmcr0 |= MMCR0_PMC1CE;
326 if (pmc_inuse & 0xfe)
327 mmcr0 |= MMCR0_PMCjCE;
328
329 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
330
331 /* Return MMCRx values */
332 mmcr[0] = mmcr0;
333 mmcr[1] = mmcr1;
334 mmcr[2] = mmcra;
335 return 0;
336}
337
338static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
339{
340 int shift, i;
341
342 if (pmc <= 1) {
343 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
344 i = 0;
345 } else {
346 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
347 i = 1;
348 }
349 /*
350 * Setting the PMCxSEL field to 0x08 disables PMC x.
351 */
352 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
353}
354
355static int ppc970_generic_events[] = {
356 [PERF_COUNT_CPU_CYCLES] = 7,
357 [PERF_COUNT_INSTRUCTIONS] = 1,
358 [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
359 [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
360 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
361 [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
362};
363
364struct power_pmu ppc970_pmu = {
365 .n_counter = 8,
366 .max_alternatives = 2,
367 .add_fields = 0x001100005555ull,
368 .test_adder = 0x013300000000ull,
369 .compute_mmcr = p970_compute_mmcr,
370 .get_constraint = p970_get_constraint,
371 .get_alternatives = p970_get_alternatives,
372 .disable_pmc = p970_disable_pmc,
373 .n_generic = ARRAY_SIZE(ppc970_generic_events),
374 .generic_events = ppc970_generic_events,
375};
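Each struct power_pmu above is a self-contained description of one PMU generation; the generic powerpc code in arch/powerpc/kernel/perf_counter.c (in the diffstat, not shown in this excerpt) selects one of them and drives it through these hooks. As a rough illustration of how the pieces fit together, and assuming struct power_pmu is declared with the fields used in the initializers above, translating generic event indices and programming them could look like the sketch below, which is not the kernel's scheduling code:

/*
 * Sketch only: map generic event indices (PERF_COUNT_*) to raw codes via
 * generic_events[], then let compute_mmcr() assign PMCs and build the
 * MMCR images.  Constraint checking and error handling are omitted.
 */
static int example_program_generic(struct power_pmu *pmu,
				   const int idx[], int n, u64 mmcr[3])
{
	unsigned int raw[8], hwc[8];
	int i;

	if (n > pmu->n_counter || n > 8)
		return -1;
	for (i = 0; i < n; ++i) {
		if (idx[i] < 0 || idx[i] >= pmu->n_generic)
			return -1;
		raw[i] = pmu->generic_events[idx[i]];
	}
	/* hwc[i] receives the 0-based PMC chosen for event i */
	return pmu->compute_mmcr(raw, n, hwc, mmcr);
}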
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac9..17bbf6f91fbe 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/kprobes.h> 30#include <linux/kprobes.h>
31#include <linux/kdebug.h> 31#include <linux/kdebug.h>
32#include <linux/perf_counter.h>
32 33
33#include <asm/firmware.h> 34#include <asm/firmware.h>
34#include <asm/page.h> 35#include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
170 die("Weird page fault", regs, SIGSEGV); 171 die("Weird page fault", regs, SIGSEGV);
171 } 172 }
172 173
174 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
175
173 /* When running in the kernel we expect faults to occur only to 176 /* When running in the kernel we expect faults to occur only to
174 * addresses in user space. All other faults represent errors in the 177 * addresses in user space. All other faults represent errors in the
175 * kernel and should generate an OOPS. Unfortunately, in the case of an 178 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -309,6 +312,7 @@ good_area:
309 } 312 }
310 if (ret & VM_FAULT_MAJOR) { 313 if (ret & VM_FAULT_MAJOR) {
311 current->maj_flt++; 314 current->maj_flt++;
315 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
312#ifdef CONFIG_PPC_SMLPAR 316#ifdef CONFIG_PPC_SMLPAR
313 if (firmware_has_feature(FW_FEATURE_CMO)) { 317 if (firmware_has_feature(FW_FEATURE_CMO)) {
314 preempt_disable(); 318 preempt_disable();
@@ -316,8 +320,10 @@ good_area:
316 preempt_enable(); 320 preempt_enable();
317 } 321 }
318#endif 322#endif
319 } else 323 } else {
320 current->min_flt++; 324 current->min_flt++;
325 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
326 }
321 up_read(&mm->mmap_sem); 327 up_read(&mm->mmap_sem);
322 return 0; 328 return 0;
323 329
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9da795e49337..732ee93a8e98 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64 1config PPC64
2 bool "64-bit kernel" 2 bool "64-bit kernel"
3 default n 3 default n
4 select HAVE_PERF_COUNTERS
4 help 5 help
5 This option selects whether a 32-bit or a 64-bit kernel 6 This option selects whether a 32-bit or a 64-bit kernel
6 will be built. 7 will be built.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b3408206091..6da24fc6a09e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -728,6 +728,7 @@ config X86_UP_IOAPIC
728config X86_LOCAL_APIC 728config X86_LOCAL_APIC
729 def_bool y 729 def_bool y
730 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 730 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
731 select HAVE_PERF_COUNTERS if (!M386 && !M486)
731 732
732config X86_IO_APIC 733config X86_IO_APIC
733 def_bool y 734 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..19c61ef6ab57 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,10 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad sys_perf_counter_open
833ia32_syscall_end: 834ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..aff9f1fcdcd7 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 * @old_val: old value that was there
298 *
299 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = __atomic64_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = __atomic64_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369
370static inline long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr);
373}
374
375static inline long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr);
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
393 * atomic64_sub - subtract the atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485
250#include <asm-generic/atomic.h> 486#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
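All of the atomic64_* arithmetic above follows one pattern: read the current value, compute the new one, and retry a compare-and-exchange until no other CPU raced in between. A userspace analogue of that loop, using GCC's __sync builtin instead of the kernel's cmpxchg8b wrapper (on 32-bit x86 this needs a -march that provides cmpxchg8b, e.g. -march=i586), is sketched here:

#include <stdint.h>

/* Userspace sketch of the read/compute/compare-exchange retry loop. */
static uint64_t add_return_demo(uint64_t delta, uint64_t *ptr)
{
	uint64_t old_val, new_val;

	do {
		old_val = *(volatile uint64_t *)ptr;	/* racy snapshot */
		new_val = old_val + delta;
		/* retry if *ptr changed between the read and the CAS */
	} while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val);

	return new_val;
}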
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..fe24d2802490 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50 50
51#ifdef CONFIG_PERF_COUNTERS 51#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 54#endif
54 55
55#ifdef CONFIG_X86_MCE_P4THERMAL 56#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 039db6aa8e02..f5ebe2aaca4b 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd70..7309c0ad6902 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,9 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_counter_interrupt(void);
33extern void perf_pending_interrupt(void);
34
32extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..545bb811ccb5 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,6 +117,11 @@
117#define GENERIC_INTERRUPT_VECTOR 0xed 117#define GENERIC_INTERRUPT_VECTOR 0xed
118 118
119/* 119/*
120 * Performance monitoring pending work vector:
121 */
122#define LOCAL_PENDING_VECTOR 0xec
123
124/*
120 * First APIC vector available to drivers: (vectors 0x30-0xee) we 125 * First APIC vector available to drivers: (vectors 0x30-0xee) we
121 * start at 0x31(0x41) to spread out vectors evenly between priority 126 * start at 0x31(0x41) to spread out vectors evenly between priority
122 * levels. (0x80 is the syscall vector) 127 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..d08dd52cb8ff
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(int nmi);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(int nmi) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
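The cpuid10_eax and cpuid10_edx unions mirror what CPUID leaf 0x0A reports for architectural performance monitoring: the version plus the number and width of the generic counters in EAX, and the number of fixed-function counters in EDX (meaningful from version 2 on). A userspace sketch of the same enumeration, duplicating the two unions and using GCC's <cpuid.h> (checking the maximum supported leaf first is omitted for brevity):

#include <stdio.h>
#include <cpuid.h>

/* Same layout as the unions in the header above. */
union cpuid10_eax {
	struct {
		unsigned int version_id:8;
		unsigned int num_counters:8;
		unsigned int bit_width:8;
		unsigned int mask_length:8;
	} split;
	unsigned int full;
};

union cpuid10_edx {
	struct {
		unsigned int num_counters_fixed:4;
		unsigned int reserved:28;
	} split;
	unsigned int full;
};

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	union cpuid10_eax ax;
	union cpuid10_edx dx;

	__cpuid(0x0a, eax, ebx, ecx, edx);	/* architectural perfmon leaf */
	ax.full = eax;
	dx.full = edx;

	printf("version %u: %u generic counters, %u bits wide, %u fixed\n",
	       ax.split.version_id, ax.split.num_counters,
	       ax.split.bit_width, dx.split.num_counters_fixed);
	return 0;
}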
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..0b4d8c2b157d 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_perf_counter_open 333
343 344
344#ifdef __KERNEL__ 345#ifdef __KERNEL__
345 346
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..d9aad876ad76 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv) 657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296 658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660 660#define __NR_perf_counter_open 295
661__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
661 662
662#ifndef __NO_STUBS 663#ifndef __NO_STUBS
663#define __ARCH_WANT_OLD_READDIR 664#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 098ec84b8c00..fb504f843e58 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36 36
37#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <asm/mpspec.h> 40#include <asm/mpspec.h>
@@ -755,6 +756,8 @@ static void local_apic_timer_interrupt(void)
755 inc_irq_stat(apic_timer_irqs); 756 inc_irq_stat(apic_timer_irqs);
756 757
757 evt->event_handler(evt); 758 evt->event_handler(evt);
759
760 perf_counter_unthrottle();
758} 761}
759 762
760/* 763/*
@@ -1127,6 +1130,7 @@ void __cpuinit setup_local_APIC(void)
1127 apic_write(APIC_ESR, 0); 1130 apic_write(APIC_ESR, 0);
1128 } 1131 }
1129#endif 1132#endif
1133 perf_counters_lapic_init(0);
1130 1134
1131 preempt_disable(); 1135 preempt_disable();
1132 1136
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa64..fd69c514ca2a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -420,6 +420,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
420 if (c->x86 >= 6) 420 if (c->x86 >= 6)
421 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 421 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
422 422
423 /* Enable Performance counter for K7 and later */
424 if (c->x86 > 6 && c->x86 <= 0x11)
425 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
426
423 if (!c->x86_model_id[0]) { 427 if (!c->x86_model_id[0]) {
424 switch (c->x86) { 428 switch (c->x86) {
425 case 0xf: 429 case 0xf:
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..a86769efe0df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -854,6 +855,7 @@ void __init identify_boot_cpu(void)
854#else 855#else
855 vgetcpu_set_mode(); 856 vgetcpu_set_mode();
856#endif 857#endif
858 init_hw_perf_counters();
857} 859}
858 860
859void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 861void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..1116a41bc7b5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1213 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2009 Jaswinder Singh Rajput
7 *
8 * For licencing details see kernel-base/COPYING
9 */
10
11#include <linux/perf_counter.h>
12#include <linux/capability.h>
13#include <linux/notifier.h>
14#include <linux/hardirq.h>
15#include <linux/kprobes.h>
16#include <linux/module.h>
17#include <linux/kdebug.h>
18#include <linux/sched.h>
19#include <linux/uaccess.h>
20
21#include <asm/apic.h>
22#include <asm/stacktrace.h>
23#include <asm/nmi.h>
24
25static bool perf_counters_initialized __read_mostly;
26
27/*
28 * Number of (generic) HW counters:
29 */
30static int nr_counters_generic __read_mostly;
31static u64 perf_counter_mask __read_mostly;
32static u64 counter_value_mask __read_mostly;
33static int counter_value_bits __read_mostly;
34
35static int nr_counters_fixed __read_mostly;
36
37struct cpu_hw_counters {
38 struct perf_counter *counters[X86_PMC_IDX_MAX];
39 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
40 unsigned long interrupts;
41 u64 throttle_ctrl;
42 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
43 int enabled;
44};
45
46/*
47 * struct pmc_x86_ops - performance counter x86 ops
48 */
49struct pmc_x86_ops {
50 u64 (*save_disable_all)(void);
51 void (*restore_all)(u64);
52 u64 (*get_status)(u64);
53 void (*ack_status)(u64);
54 void (*enable)(int, u64);
55 void (*disable)(int, u64);
56 unsigned eventsel;
57 unsigned perfctr;
58 u64 (*event_map)(int);
59 u64 (*raw_event)(u64);
60 int max_events;
61};
62
63static struct pmc_x86_ops *pmc_ops __read_mostly;
64
65static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
66 .enabled = 1,
67};
68
69static __read_mostly int intel_perfmon_version;
70
71/*
72 * Intel PerfMon v3. Used on Core2 and later.
73 */
74static const u64 intel_perfmon_event_map[] =
75{
76 [PERF_COUNT_CPU_CYCLES] = 0x003c,
77 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
78 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
79 [PERF_COUNT_CACHE_MISSES] = 0x412e,
80 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
81 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
82 [PERF_COUNT_BUS_CYCLES] = 0x013c,
83};
84
85static u64 pmc_intel_event_map(int event)
86{
87 return intel_perfmon_event_map[event];
88}
89
90static u64 pmc_intel_raw_event(u64 event)
91{
92#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
93#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
94#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
95
96#define CORE_EVNTSEL_MASK \
97 (CORE_EVNTSEL_EVENT_MASK | \
98 CORE_EVNTSEL_UNIT_MASK | \
99 CORE_EVNTSEL_COUNTER_MASK)
100
101 return event & CORE_EVNTSEL_MASK;
102}
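
For illustration only (not part of the patch): the raw-event path keeps just the event-select, unit-mask and counter-mask fields of a user-supplied EVNTSEL value. A small user-space sketch decomposing the architectural LLC-miss encoding 0x412e (event 0x2e, umask 0x41) under the CORE_EVNTSEL masks above:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t raw   = 0x412e;                      /* LLC Misses: umask 0x41, event 0x2e */
		uint64_t event = raw & 0x000000FFULL;         /* CORE_EVNTSEL_EVENT_MASK */
		uint64_t umask = (raw & 0x0000FF00ULL) >> 8;  /* CORE_EVNTSEL_UNIT_MASK */

		printf("event select: %#llx, unit mask: %#llx\n",
		       (unsigned long long)event, (unsigned long long)umask);
		return 0;
	}
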
103
104/*
105 * AMD Performance Monitor K7 and later.
106 */
107static const u64 amd_perfmon_event_map[] =
108{
109 [PERF_COUNT_CPU_CYCLES] = 0x0076,
110 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
111 [PERF_COUNT_CACHE_REFERENCES] = 0x0080,
112 [PERF_COUNT_CACHE_MISSES] = 0x0081,
113 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
114 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
115};
116
117static u64 pmc_amd_event_map(int event)
118{
119 return amd_perfmon_event_map[event];
120}
121
122static u64 pmc_amd_raw_event(u64 event)
123{
124#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
125#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
126#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
127
128#define K7_EVNTSEL_MASK \
129 (K7_EVNTSEL_EVENT_MASK | \
130 K7_EVNTSEL_UNIT_MASK | \
131 K7_EVNTSEL_COUNTER_MASK)
132
133 return event & K7_EVNTSEL_MASK;
134}
135
136/*
137 * Propagate counter elapsed time into the generic counter.
138 * Can only be executed on the CPU where the counter is active.
139 * Returns the delta events processed.
140 */
141static void
142x86_perf_counter_update(struct perf_counter *counter,
143 struct hw_perf_counter *hwc, int idx)
144{
145 u64 prev_raw_count, new_raw_count, delta;
146
147 /*
148 * Careful: an NMI might modify the previous counter value.
149 *
150 * Our tactic to handle this is to first atomically read and
151 * exchange a new raw count - then add that new-prev delta
152 * count to the generic counter atomically:
153 */
154again:
155 prev_raw_count = atomic64_read(&hwc->prev_count);
156 rdmsrl(hwc->counter_base + idx, new_raw_count);
157
158 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
159 new_raw_count) != prev_raw_count)
160 goto again;
161
162 /*
163 * Now we have the new raw value and have updated the prev
164 * timestamp already. We can now calculate the elapsed delta
165 * (counter-)time and add that to the generic counter.
166 *
167 * Careful, not all hw sign-extends above the physical width
168 * of the count, so we do that by clipping the delta to 32 bits:
169 */
170 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
171
172 atomic64_add(delta, &counter->count);
173 atomic64_sub(delta, &hwc->period_left);
174}
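
The update loop above is lock-free: an NMI that changes prev_count between the read and the cmpxchg simply forces another pass. A minimal user-space analogue of the same pattern, with GCC atomic builtins standing in for atomic64_cmpxchg() and the same 32-bit clipping of the delta (a sketch, not kernel code):

	#include <stdint.h>

	static uint64_t prev_count;   /* analogue of hwc->prev_count */
	static uint64_t total;        /* analogue of counter->count  */

	static void update(uint64_t new_raw)
	{
		uint64_t prev, delta;

		do {
			prev = __atomic_load_n(&prev_count, __ATOMIC_RELAXED);
		} while (!__atomic_compare_exchange_n(&prev_count, &prev, new_raw,
						      0, __ATOMIC_RELAXED, __ATOMIC_RELAXED));

		/* clip to 32 bits, as the hardware may not sign-extend the full width */
		delta = (uint64_t)(uint32_t)((int32_t)new_raw - (int32_t)prev);
		__atomic_add_fetch(&total, delta, __ATOMIC_RELAXED);
	}
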
175
176static atomic_t num_counters;
177static DEFINE_MUTEX(pmc_reserve_mutex);
178
179static bool reserve_pmc_hardware(void)
180{
181 int i;
182
183 if (nmi_watchdog == NMI_LOCAL_APIC)
184 disable_lapic_nmi_watchdog();
185
186 for (i = 0; i < nr_counters_generic; i++) {
187 if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
188 goto perfctr_fail;
189 }
190
191 for (i = 0; i < nr_counters_generic; i++) {
192 if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
193 goto eventsel_fail;
194 }
195
196 return true;
197
198eventsel_fail:
199 for (i--; i >= 0; i--)
200 release_evntsel_nmi(pmc_ops->eventsel + i);
201
202 i = nr_counters_generic;
203
204perfctr_fail:
205 for (i--; i >= 0; i--)
206 release_perfctr_nmi(pmc_ops->perfctr + i);
207
208 if (nmi_watchdog == NMI_LOCAL_APIC)
209 enable_lapic_nmi_watchdog();
210
211 return false;
212}
213
214static void release_pmc_hardware(void)
215{
216 int i;
217
218 for (i = 0; i < nr_counters_generic; i++) {
219 release_perfctr_nmi(pmc_ops->perfctr + i);
220 release_evntsel_nmi(pmc_ops->eventsel + i);
221 }
222
223 if (nmi_watchdog == NMI_LOCAL_APIC)
224 enable_lapic_nmi_watchdog();
225}
226
227static void hw_perf_counter_destroy(struct perf_counter *counter)
228{
229 if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
230 release_pmc_hardware();
231 mutex_unlock(&pmc_reserve_mutex);
232 }
233}
234
235/*
236 * Setup the hardware configuration for a given hw_event_type
237 */
238static int __hw_perf_counter_init(struct perf_counter *counter)
239{
240 struct perf_counter_hw_event *hw_event = &counter->hw_event;
241 struct hw_perf_counter *hwc = &counter->hw;
242 int err;
243
244 if (unlikely(!perf_counters_initialized))
245 return -EINVAL;
246
247 err = 0;
248 if (atomic_inc_not_zero(&num_counters)) {
249 mutex_lock(&pmc_reserve_mutex);
250 if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
251 err = -EBUSY;
252 else
253 atomic_inc(&num_counters);
254 mutex_unlock(&pmc_reserve_mutex);
255 }
256 if (err)
257 return err;
258
259 /*
260 * Generate PMC IRQs:
261 * (keep 'enabled' bit clear for now)
262 */
263 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
264
265 /*
266 * Count user and OS events unless requested not to.
267 */
268 if (!hw_event->exclude_user)
269 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
270 if (!hw_event->exclude_kernel)
271 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
272
273 /*
274 * If privileged enough, allow NMI events:
275 */
276 hwc->nmi = 0;
277 if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
278 hwc->nmi = 1;
279
280 hwc->irq_period = hw_event->irq_period;
281 /*
282 * Intel PMCs cannot be accessed sanely above 32 bit width,
283 * so we install an artificial 1<<31 period regardless of
284 * the generic counter period:
285 */
286 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
287 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
288 hwc->irq_period = 0x7FFFFFFF;
289
290 atomic64_set(&hwc->period_left, hwc->irq_period);
291
292 /*
293 * Raw event type provide the config in the event structure
294 */
295 if (perf_event_raw(hw_event)) {
296 hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
297 } else {
298 if (perf_event_id(hw_event) >= pmc_ops->max_events)
299 return -EINVAL;
300 /*
301 * The generic map:
302 */
303 hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
304 }
305
306 counter->destroy = hw_perf_counter_destroy;
307
308 return 0;
309}
310
311static u64 pmc_intel_save_disable_all(void)
312{
313 u64 ctrl;
314
315 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
316 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
317
318 return ctrl;
319}
320
321static u64 pmc_amd_save_disable_all(void)
322{
323 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
324 int enabled, idx;
325
326 enabled = cpuc->enabled;
327 cpuc->enabled = 0;
328 /*
329 * ensure we write the disable before we start disabling the
330	 * counters proper, so that pmc_amd_enable() does the right thing.
331 */
332 barrier();
333
334 for (idx = 0; idx < nr_counters_generic; idx++) {
335 u64 val;
336
337 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
338 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
339 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
340 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
341 }
342 }
343
344 return enabled;
345}
346
347u64 hw_perf_save_disable(void)
348{
349 if (unlikely(!perf_counters_initialized))
350 return 0;
351
352 return pmc_ops->save_disable_all();
353}
354/*
355 * Exported because of ACPI idle
356 */
357EXPORT_SYMBOL_GPL(hw_perf_save_disable);
358
359static void pmc_intel_restore_all(u64 ctrl)
360{
361 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
362}
363
364static void pmc_amd_restore_all(u64 ctrl)
365{
366 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
367 int idx;
368
369 cpuc->enabled = ctrl;
370 barrier();
371 if (!ctrl)
372 return;
373
374 for (idx = 0; idx < nr_counters_generic; idx++) {
375 if (test_bit(idx, cpuc->active_mask)) {
376 u64 val;
377
378 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
379 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
380 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
381 }
382 }
383}
384
385void hw_perf_restore(u64 ctrl)
386{
387 if (unlikely(!perf_counters_initialized))
388 return;
389
390 pmc_ops->restore_all(ctrl);
391}
392/*
393 * Exported because of ACPI idle
394 */
395EXPORT_SYMBOL_GPL(hw_perf_restore);
396
397static u64 pmc_intel_get_status(u64 mask)
398{
399 u64 status;
400
401 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
402
403 return status;
404}
405
406static u64 pmc_amd_get_status(u64 mask)
407{
408 u64 status = 0;
409 int idx;
410
411 for (idx = 0; idx < nr_counters_generic; idx++) {
412 s64 val;
413
414 if (!(mask & (1 << idx)))
415 continue;
416
417 rdmsrl(MSR_K7_PERFCTR0 + idx, val);
418 val <<= (64 - counter_value_bits);
419 if (val >= 0)
420 status |= (1 << idx);
421 }
422
423 return status;
424}
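
With 48-bit AMD counters, shifting the raw value left by 64 - 48 = 16 bits moves counter bit 47 into the sign bit; a counter that has wrapped past the top (overflowed) has that bit clear, so the shifted value reads as non-negative. A worked illustration of the test, assuming 48 counter bits:

	#include <stdint.h>

	static int overflowed(uint64_t raw48)
	{
		int64_t shifted = (int64_t)(raw48 << 16);   /* 64 - counter_value_bits */

		return shifted >= 0;    /* bit 47 clear => counter wrapped */
	}
	/* overflowed(0x0000000000001234ULL) == 1, overflowed(0xFFFFFFFE7960ULL) == 0 */
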
425
426static u64 hw_perf_get_status(u64 mask)
427{
428 if (unlikely(!perf_counters_initialized))
429 return 0;
430
431 return pmc_ops->get_status(mask);
432}
433
434static void pmc_intel_ack_status(u64 ack)
435{
436 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
437}
438
439static void pmc_amd_ack_status(u64 ack)
440{
441}
442
443static void hw_perf_ack_status(u64 ack)
444{
445 if (unlikely(!perf_counters_initialized))
446 return;
447
448 pmc_ops->ack_status(ack);
449}
450
451static void pmc_intel_enable(int idx, u64 config)
452{
453 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
454 config | ARCH_PERFMON_EVENTSEL0_ENABLE);
455}
456
457static void pmc_amd_enable(int idx, u64 config)
458{
459 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
460
461 set_bit(idx, cpuc->active_mask);
462 if (cpuc->enabled)
463 config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
464
465 wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
466}
467
468static void hw_perf_enable(int idx, u64 config)
469{
470 if (unlikely(!perf_counters_initialized))
471 return;
472
473 pmc_ops->enable(idx, config);
474}
475
476static void pmc_intel_disable(int idx, u64 config)
477{
478 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
479}
480
481static void pmc_amd_disable(int idx, u64 config)
482{
483 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
484
485 clear_bit(idx, cpuc->active_mask);
486 wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
487
488}
489
490static void hw_perf_disable(int idx, u64 config)
491{
492 if (unlikely(!perf_counters_initialized))
493 return;
494
495 pmc_ops->disable(idx, config);
496}
497
498static inline void
499__pmc_fixed_disable(struct perf_counter *counter,
500 struct hw_perf_counter *hwc, unsigned int __idx)
501{
502 int idx = __idx - X86_PMC_IDX_FIXED;
503 u64 ctrl_val, mask;
504 int err;
505
506 mask = 0xfULL << (idx * 4);
507
508 rdmsrl(hwc->config_base, ctrl_val);
509 ctrl_val &= ~mask;
510 err = checking_wrmsrl(hwc->config_base, ctrl_val);
511}
512
513static inline void
514__pmc_generic_disable(struct perf_counter *counter,
515 struct hw_perf_counter *hwc, unsigned int idx)
516{
517 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
518 __pmc_fixed_disable(counter, hwc, idx);
519 else
520 hw_perf_disable(idx, hwc->config);
521}
522
523static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
524
525/*
526 * Set the next IRQ period, based on the hwc->period_left value.
527 * To be called with the counter disabled in hw:
528 */
529static void
530__hw_perf_counter_set_period(struct perf_counter *counter,
531 struct hw_perf_counter *hwc, int idx)
532{
533 s64 left = atomic64_read(&hwc->period_left);
534 s64 period = hwc->irq_period;
535 int err;
536
537 /*
538	 * If we are way outside a reasonable range then just skip forward:
539 */
540 if (unlikely(left <= -period)) {
541 left = period;
542 atomic64_set(&hwc->period_left, left);
543 }
544
545 if (unlikely(left <= 0)) {
546 left += period;
547 atomic64_set(&hwc->period_left, left);
548 }
549
550 per_cpu(prev_left[idx], smp_processor_id()) = left;
551
552 /*
553 * The hw counter starts counting from this counter offset,
554	 * mark it to be able to extract future deltas:
555 */
556 atomic64_set(&hwc->prev_count, (u64)-left);
557
558 err = checking_wrmsrl(hwc->counter_base + idx,
559 (u64)(-left) & counter_value_mask);
560}
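
Writing (u64)(-left) arms the counter so that it overflows, and raises the PMI, after exactly `left` further events. For example, with left = 100000 on a 48-bit counter the value written is 2^48 - 100000 = 0xFFFFFFFE7960. A small sketch of that arithmetic (illustrative, user-space):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t mask = (1ULL << 48) - 1;   /* counter_value_mask for 48-bit PMCs */
		int64_t  left = 100000;

		uint64_t programmed = (uint64_t)(-left) & mask;
		printf("write %#llx; overflow after %llu events\n",
		       (unsigned long long)programmed,
		       (unsigned long long)(mask + 1 - programmed));
		return 0;
	}
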
561
562static inline void
563__pmc_fixed_enable(struct perf_counter *counter,
564 struct hw_perf_counter *hwc, unsigned int __idx)
565{
566 int idx = __idx - X86_PMC_IDX_FIXED;
567 u64 ctrl_val, bits, mask;
568 int err;
569
570 /*
571 * Enable IRQ generation (0x8),
572 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
573 * if requested:
574 */
575 bits = 0x8ULL;
576 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
577 bits |= 0x2;
578 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
579 bits |= 0x1;
580 bits <<= (idx * 4);
581 mask = 0xfULL << (idx * 4);
582
583 rdmsrl(hwc->config_base, ctrl_val);
584 ctrl_val &= ~mask;
585 ctrl_val |= bits;
586 err = checking_wrmsrl(hwc->config_base, ctrl_val);
587}
588
589static void
590__pmc_generic_enable(struct perf_counter *counter,
591 struct hw_perf_counter *hwc, int idx)
592{
593 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
594 __pmc_fixed_enable(counter, hwc, idx);
595 else
596 hw_perf_enable(idx, hwc->config);
597}
598
599static int
600fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
601{
602 unsigned int event;
603
604 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
605 return -1;
606
607 if (unlikely(hwc->nmi))
608 return -1;
609
610 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
611
612 if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
613 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
614 if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
615 return X86_PMC_IDX_FIXED_CPU_CYCLES;
616 if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
617 return X86_PMC_IDX_FIXED_BUS_CYCLES;
618
619 return -1;
620}
621
622/*
623 * Find a PMC slot for the freshly enabled / scheduled in counter:
624 */
625static int pmc_generic_enable(struct perf_counter *counter)
626{
627 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
628 struct hw_perf_counter *hwc = &counter->hw;
629 int idx;
630
631 idx = fixed_mode_idx(counter, hwc);
632 if (idx >= 0) {
633 /*
634 * Try to get the fixed counter, if that is already taken
635 * then try to get a generic counter:
636 */
637 if (test_and_set_bit(idx, cpuc->used))
638 goto try_generic;
639
640 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
641 /*
642 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
643 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
644 */
645 hwc->counter_base =
646 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
647 hwc->idx = idx;
648 } else {
649 idx = hwc->idx;
650 /* Try to get the previous generic counter again */
651 if (test_and_set_bit(idx, cpuc->used)) {
652try_generic:
653 idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
654 if (idx == nr_counters_generic)
655 return -EAGAIN;
656
657 set_bit(idx, cpuc->used);
658 hwc->idx = idx;
659 }
660 hwc->config_base = pmc_ops->eventsel;
661 hwc->counter_base = pmc_ops->perfctr;
662 }
663
664 perf_counters_lapic_init(hwc->nmi);
665
666 __pmc_generic_disable(counter, hwc, idx);
667
668 cpuc->counters[idx] = counter;
669 /*
670 * Make it visible before enabling the hw:
671 */
672 smp_wmb();
673
674 __hw_perf_counter_set_period(counter, hwc, idx);
675 __pmc_generic_enable(counter, hwc, idx);
676
677 return 0;
678}
679
680void perf_counter_print_debug(void)
681{
682 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
683 struct cpu_hw_counters *cpuc;
684 int cpu, idx;
685
686 if (!nr_counters_generic)
687 return;
688
689 local_irq_disable();
690
691 cpu = smp_processor_id();
692 cpuc = &per_cpu(cpu_hw_counters, cpu);
693
694 if (intel_perfmon_version >= 2) {
695 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
696 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
697 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
698 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
699
700 pr_info("\n");
701 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
702 pr_info("CPU#%d: status: %016llx\n", cpu, status);
703 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
704 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
705 }
706 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
707
708 for (idx = 0; idx < nr_counters_generic; idx++) {
709 rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
710 rdmsrl(pmc_ops->perfctr + idx, pmc_count);
711
712 prev_left = per_cpu(prev_left[idx], cpu);
713
714 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
715 cpu, idx, pmc_ctrl);
716 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
717 cpu, idx, pmc_count);
718 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
719 cpu, idx, prev_left);
720 }
721 for (idx = 0; idx < nr_counters_fixed; idx++) {
722 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
723
724 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
725 cpu, idx, pmc_count);
726 }
727 local_irq_enable();
728}
729
730static void pmc_generic_disable(struct perf_counter *counter)
731{
732 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
733 struct hw_perf_counter *hwc = &counter->hw;
734 unsigned int idx = hwc->idx;
735
736 __pmc_generic_disable(counter, hwc, idx);
737
738 clear_bit(idx, cpuc->used);
739 cpuc->counters[idx] = NULL;
740 /*
741 * Make sure the cleared pointer becomes visible before we
742 * (potentially) free the counter:
743 */
744 smp_wmb();
745
746 /*
747 * Drain the remaining delta count out of a counter
748 * that we are disabling:
749 */
750 x86_perf_counter_update(counter, hwc, idx);
751}
752
753/*
754 * Save and restart an expired counter. Called by NMI contexts,
755 * so it has to be careful about preempting normal counter ops:
756 */
757static void perf_save_and_restart(struct perf_counter *counter)
758{
759 struct hw_perf_counter *hwc = &counter->hw;
760 int idx = hwc->idx;
761
762 x86_perf_counter_update(counter, hwc, idx);
763 __hw_perf_counter_set_period(counter, hwc, idx);
764
765 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
766 __pmc_generic_enable(counter, hwc, idx);
767}
768
769/*
770 * Maximum interrupt frequency of 100KHz per CPU
771 */
772#define PERFMON_MAX_INTERRUPTS (100000/HZ)
773
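(With HZ = 1000 this evaluates to 100000/1000 = 100 interrupts per timer tick; since perf_counter_unthrottle() clears cpuc->interrupts on every local APIC timer tick, the effective cap is the 100 KHz per-CPU rate named in the comment above.)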
774/*
775 * This handler is triggered by the local APIC, so the APIC IRQ handling
776 * rules apply:
777 */
778static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
779{
780 int bit, cpu = smp_processor_id();
781 u64 ack, status;
782 struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
783 int ret = 0;
784
785 cpuc->throttle_ctrl = hw_perf_save_disable();
786
787 status = hw_perf_get_status(cpuc->throttle_ctrl);
788 if (!status)
789 goto out;
790
791 ret = 1;
792again:
793 inc_irq_stat(apic_perf_irqs);
794 ack = status;
795 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
796 struct perf_counter *counter = cpuc->counters[bit];
797
798 clear_bit(bit, (unsigned long *) &status);
799 if (!counter)
800 continue;
801
802 perf_save_and_restart(counter);
803 if (perf_counter_overflow(counter, nmi, regs))
804 __pmc_generic_disable(counter, &counter->hw, bit);
805 }
806
807 hw_perf_ack_status(ack);
808
809 /*
810 * Repeat if there is more work to be done:
811 */
812 status = hw_perf_get_status(cpuc->throttle_ctrl);
813 if (status)
814 goto again;
815out:
816 /*
817 * Restore - do not reenable when global enable is off or throttled:
818 */
819 if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
820 hw_perf_restore(cpuc->throttle_ctrl);
821
822 return ret;
823}
824
825void perf_counter_unthrottle(void)
826{
827 struct cpu_hw_counters *cpuc;
828
829 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
830 return;
831
832 if (unlikely(!perf_counters_initialized))
833 return;
834
835 cpuc = &__get_cpu_var(cpu_hw_counters);
836 if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
837 if (printk_ratelimit())
838 printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
839 hw_perf_restore(cpuc->throttle_ctrl);
840 }
841 cpuc->interrupts = 0;
842}
843
844void smp_perf_counter_interrupt(struct pt_regs *regs)
845{
846 irq_enter();
847 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
848 ack_APIC_irq();
849 __smp_perf_counter_interrupt(regs, 0);
850 irq_exit();
851}
852
853void smp_perf_pending_interrupt(struct pt_regs *regs)
854{
855 irq_enter();
856 ack_APIC_irq();
857 inc_irq_stat(apic_pending_irqs);
858 perf_counter_do_pending();
859 irq_exit();
860}
861
862void set_perf_counter_pending(void)
863{
864 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
865}
866
867void perf_counters_lapic_init(int nmi)
868{
869 u32 apic_val;
870
871 if (!perf_counters_initialized)
872 return;
873 /*
874 * Enable the performance counter vector in the APIC LVT:
875 */
876 apic_val = apic_read(APIC_LVTERR);
877
878 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
879 if (nmi)
880 apic_write(APIC_LVTPC, APIC_DM_NMI);
881 else
882 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
883 apic_write(APIC_LVTERR, apic_val);
884}
885
886static int __kprobes
887perf_counter_nmi_handler(struct notifier_block *self,
888 unsigned long cmd, void *__args)
889{
890 struct die_args *args = __args;
891 struct pt_regs *regs;
892 int ret;
893
894 switch (cmd) {
895 case DIE_NMI:
896 case DIE_NMI_IPI:
897 break;
898
899 default:
900 return NOTIFY_DONE;
901 }
902
903 regs = args->regs;
904
905 apic_write(APIC_LVTPC, APIC_DM_NMI);
906 ret = __smp_perf_counter_interrupt(regs, 1);
907
908 return ret ? NOTIFY_STOP : NOTIFY_OK;
909}
910
911static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
912 .notifier_call = perf_counter_nmi_handler,
913 .next = NULL,
914 .priority = 1
915};
916
917static struct pmc_x86_ops pmc_intel_ops = {
918 .save_disable_all = pmc_intel_save_disable_all,
919 .restore_all = pmc_intel_restore_all,
920 .get_status = pmc_intel_get_status,
921 .ack_status = pmc_intel_ack_status,
922 .enable = pmc_intel_enable,
923 .disable = pmc_intel_disable,
924 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
925 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
926 .event_map = pmc_intel_event_map,
927 .raw_event = pmc_intel_raw_event,
928 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
929};
930
931static struct pmc_x86_ops pmc_amd_ops = {
932 .save_disable_all = pmc_amd_save_disable_all,
933 .restore_all = pmc_amd_restore_all,
934 .get_status = pmc_amd_get_status,
935 .ack_status = pmc_amd_ack_status,
936 .enable = pmc_amd_enable,
937 .disable = pmc_amd_disable,
938 .eventsel = MSR_K7_EVNTSEL0,
939 .perfctr = MSR_K7_PERFCTR0,
940 .event_map = pmc_amd_event_map,
941 .raw_event = pmc_amd_raw_event,
942 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
943};
944
945static struct pmc_x86_ops *pmc_intel_init(void)
946{
947 union cpuid10_edx edx;
948 union cpuid10_eax eax;
949 unsigned int unused;
950 unsigned int ebx;
951
952 /*
953 * Check whether the Architectural PerfMon supports
954 * Branch Misses Retired Event or not.
955 */
956 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
957 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
958 return NULL;
959
960 intel_perfmon_version = eax.split.version_id;
961 if (intel_perfmon_version < 2)
962 return NULL;
963
964 pr_info("Intel Performance Monitoring support detected.\n");
965 pr_info("... version: %d\n", intel_perfmon_version);
966 pr_info("... bit width: %d\n", eax.split.bit_width);
967 pr_info("... mask length: %d\n", eax.split.mask_length);
968
969 nr_counters_generic = eax.split.num_counters;
970 nr_counters_fixed = edx.split.num_counters_fixed;
971 counter_value_mask = (1ULL << eax.split.bit_width) - 1;
972
973 return &pmc_intel_ops;
974}
975
976static struct pmc_x86_ops *pmc_amd_init(void)
977{
978 nr_counters_generic = 4;
979 nr_counters_fixed = 0;
980 counter_value_mask = 0x0000FFFFFFFFFFFFULL;
981 counter_value_bits = 48;
982
983 pr_info("AMD Performance Monitoring support detected.\n");
984
985 return &pmc_amd_ops;
986}
987
988void __init init_hw_perf_counters(void)
989{
990 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
991 return;
992
993 switch (boot_cpu_data.x86_vendor) {
994 case X86_VENDOR_INTEL:
995 pmc_ops = pmc_intel_init();
996 break;
997 case X86_VENDOR_AMD:
998 pmc_ops = pmc_amd_init();
999 break;
1000 }
1001 if (!pmc_ops)
1002 return;
1003
1004 pr_info("... num counters: %d\n", nr_counters_generic);
1005 if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
1006 nr_counters_generic = X86_PMC_MAX_GENERIC;
1007 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1008 nr_counters_generic, X86_PMC_MAX_GENERIC);
1009 }
1010 perf_counter_mask = (1 << nr_counters_generic) - 1;
1011 perf_max_counters = nr_counters_generic;
1012
1013 pr_info("... value mask: %016Lx\n", counter_value_mask);
1014
1015 if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
1016 nr_counters_fixed = X86_PMC_MAX_FIXED;
1017 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1018 nr_counters_fixed, X86_PMC_MAX_FIXED);
1019 }
1020 pr_info("... fixed counters: %d\n", nr_counters_fixed);
1021
1022 perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1023
1024 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1025 perf_counters_initialized = true;
1026
1027 perf_counters_lapic_init(0);
1028 register_die_notifier(&perf_counter_nmi_notifier);
1029}
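
The resulting perf_counter_mask has one bit per usable counter: generic counters occupy the low bits and fixed counters start at bit X86_PMC_IDX_FIXED (32 on x86 in this series). As a worked example, assuming a CPU with 2 generic and 3 fixed counters:

	(1 << 2) - 1              = 0x0000000000000003   /* generic counters  */
	((1LL << 3) - 1) << 32    = 0x0000000700000000   /* fixed counters    */
	perf_counter_mask         = 0x0000000700000003
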
1030
1031static void pmc_generic_read(struct perf_counter *counter)
1032{
1033 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1034}
1035
1036static const struct hw_perf_counter_ops x86_perf_counter_ops = {
1037 .enable = pmc_generic_enable,
1038 .disable = pmc_generic_disable,
1039 .read = pmc_generic_read,
1040};
1041
1042const struct hw_perf_counter_ops *
1043hw_perf_counter_init(struct perf_counter *counter)
1044{
1045 int err;
1046
1047 err = __hw_perf_counter_init(counter);
1048 if (err)
1049 return ERR_PTR(err);
1050
1051 return &x86_perf_counter_ops;
1052}
1053
1054/*
1055 * callchain support
1056 */
1057
1058static inline
1059void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1060{
1061 if (entry->nr < MAX_STACK_DEPTH)
1062 entry->ip[entry->nr++] = ip;
1063}
1064
1065static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1066static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1067
1068
1069static void
1070backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1071{
1072 /* Ignore warnings */
1073}
1074
1075static void backtrace_warning(void *data, char *msg)
1076{
1077 /* Ignore warnings */
1078}
1079
1080static int backtrace_stack(void *data, char *name)
1081{
1082 /* Don't bother with IRQ stacks for now */
1083 return -1;
1084}
1085
1086static void backtrace_address(void *data, unsigned long addr, int reliable)
1087{
1088 struct perf_callchain_entry *entry = data;
1089
1090 if (reliable)
1091 callchain_store(entry, addr);
1092}
1093
1094static const struct stacktrace_ops backtrace_ops = {
1095 .warning = backtrace_warning,
1096 .warning_symbol = backtrace_warning_symbol,
1097 .stack = backtrace_stack,
1098 .address = backtrace_address,
1099};
1100
1101static void
1102perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1103{
1104 unsigned long bp;
1105 char *stack;
1106 int nr = entry->nr;
1107
1108 callchain_store(entry, instruction_pointer(regs));
1109
1110 stack = ((char *)regs + sizeof(struct pt_regs));
1111#ifdef CONFIG_FRAME_POINTER
1112 bp = frame_pointer(regs);
1113#else
1114 bp = 0;
1115#endif
1116
1117 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1118
1119 entry->kernel = entry->nr - nr;
1120}
1121
1122
1123struct stack_frame {
1124 const void __user *next_fp;
1125 unsigned long return_address;
1126};
1127
1128static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1129{
1130 int ret;
1131
1132 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1133 return 0;
1134
1135 ret = 1;
1136 pagefault_disable();
1137 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1138 ret = 0;
1139 pagefault_enable();
1140
1141 return ret;
1142}
1143
1144static void
1145perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1146{
1147 struct stack_frame frame;
1148 const void __user *fp;
1149 int nr = entry->nr;
1150
1151 regs = (struct pt_regs *)current->thread.sp0 - 1;
1152 fp = (void __user *)regs->bp;
1153
1154 callchain_store(entry, regs->ip);
1155
1156 while (entry->nr < MAX_STACK_DEPTH) {
1157 frame.next_fp = NULL;
1158 frame.return_address = 0;
1159
1160 if (!copy_stack_frame(fp, &frame))
1161 break;
1162
1163 if ((unsigned long)fp < user_stack_pointer(regs))
1164 break;
1165
1166 callchain_store(entry, frame.return_address);
1167 fp = frame.next_fp;
1168 }
1169
1170 entry->user = entry->nr - nr;
1171}
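
The user-side walk follows saved frame pointers: each frame begins with the caller's frame pointer followed by the return address, which is exactly the struct stack_frame layout above. A user-space sketch of the same walk over the current process's own stack (assumes a build with frame pointers, e.g. -fno-omit-frame-pointer, and no access_ok()/pagefault_disable() protection, so it is only safe while fp stays inside the current stack):

	#include <stdio.h>

	struct stack_frame {
		const struct stack_frame *next_fp;
		unsigned long return_address;
	};

	void dump_own_callchain(void)
	{
		const struct stack_frame *fp = __builtin_frame_address(0);
		int depth;

		for (depth = 0; fp && depth < 64; depth++) {
			printf("  %#lx\n", fp->return_address);
			fp = fp->next_fp;
		}
	}
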
1172
1173static void
1174perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1175{
1176 int is_user;
1177
1178 if (!regs)
1179 return;
1180
1181 is_user = user_mode(regs);
1182
1183 if (!current || current->pid == 0)
1184 return;
1185
1186 if (is_user && current->state != TASK_RUNNING)
1187 return;
1188
1189 if (!is_user)
1190 perf_callchain_kernel(regs, entry);
1191
1192 if (current->mm)
1193 perf_callchain_user(regs, entry);
1194}
1195
1196struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1197{
1198 struct perf_callchain_entry *entry;
1199
1200 if (in_nmi())
1201 entry = &__get_cpu_var(nmi_entry);
1202 else
1203 entry = &__get_cpu_var(irq_entry);
1204
1205 entry->nr = 0;
1206 entry->hv = 0;
1207 entry->kernel = 0;
1208 entry->user = 0;
1209
1210 perf_do_callchain(regs, entry);
1211
1212 return entry;
1213}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a331ec38af9e..1d46cba56fd8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1025,6 +1025,13 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1025apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1026 spurious_interrupt smp_spurious_interrupt
1027 1027
1028#ifdef CONFIG_PERF_COUNTERS
1029apicinterrupt LOCAL_PERF_VECTOR \
1030 perf_counter_interrupt smp_perf_counter_interrupt
1031apicinterrupt LOCAL_PENDING_VECTOR \
1032 perf_pending_interrupt smp_perf_pending_interrupt
1033#endif
1034
1028/* 1035/*
1029 * Exception entry points. 1036 * Exception entry points.
1030 */ 1037 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3aaf7b9e3a8b..d465487da587 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "CNT: ");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "PND: ");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "PLT: "); 76 seq_printf(p, "PLT: ");
@@ -166,6 +174,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 174#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 175 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 176 sum += irq_stats(cpu)->irq_spurious_count;
177 sum += irq_stats(cpu)->apic_perf_irqs;
178 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 179#endif
170 if (generic_interrupt_extension) 180 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 181 sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8836f9..3190a6b961e6 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -118,28 +118,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 118 return 0;
119} 119}
120 120
121/* Overridden in paravirt.c */ 121static void __init smp_intr_init(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 122{
126 int i;
127
128 /* Execute any quirks before the call gates are initialised: */
129 x86_quirk_pre_intr_init();
130
131 /*
132 * Cover the whole vector space, no vector can escape
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */
138 if (i != SYSCALL_VECTOR)
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
140 }
141
142
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 123#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
144 /* 124 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 125 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -168,6 +148,11 @@ void __init native_init_IRQ(void)
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 148 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 149 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
170#endif 150#endif
151}
152
153static void __init apic_intr_init(void)
154{
155 smp_intr_init();
171 156
172#ifdef CONFIG_X86_LOCAL_APIC 157#ifdef CONFIG_X86_LOCAL_APIC
173 /* self generated IPI for local APIC timer */ 158 /* self generated IPI for local APIC timer */
@@ -179,12 +164,41 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 164 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 165 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 166 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
182#endif 167# ifdef CONFIG_PERF_COUNTERS
168 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
169 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
170# endif
183 171
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 172# ifdef CONFIG_X86_MCE_P4THERMAL
185 /* thermal monitor LVT interrupt */ 173 /* thermal monitor LVT interrupt */
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 174 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
175# endif
187#endif 176#endif
177}
178
179/* Overridden in paravirt.c */
180void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
181
182void __init native_init_IRQ(void)
183{
184 int i;
185
186 /* Execute any quirks before the call gates are initialised: */
187 x86_quirk_pre_intr_init();
188
189 apic_intr_init();
190
191 /*
192 * Cover the whole vector space, no vector can escape
193 * us. (some of these will be overridden and become
194 * 'special' SMP interrupts)
195 */
196 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
197 int vector = FIRST_EXTERNAL_VECTOR + i;
198 /* SYSCALL_VECTOR was reserved in trap_init. */
199 if (!test_bit(vector, used_vectors))
200 set_intr_gate(vector, interrupt[i]);
201 }
188 202
189 if (!acpi_ioapic) 203 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 204 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8cd10537fd46..53ceb26f80ff 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -152,6 +152,12 @@ static void __init apic_intr_init(void)
152 /* IPI vectors for APIC spurious and error interrupts */ 152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155
156 /* Performance monitoring interrupt: */
157#ifdef CONFIG_PERF_COUNTERS
158 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
159 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
160#endif
155} 161}
156 162
157void __init native_init_IRQ(void) 163void __init native_init_IRQ(void)
@@ -159,6 +165,9 @@ void __init native_init_IRQ(void)
159 int i; 165 int i;
160 166
161 init_ISA_irqs(); 167 init_ISA_irqs();
168
169 apic_intr_init();
170
162 /* 171 /*
163 * Cover the whole vector space, no vector can escape 172 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become 173 * us. (some of these will be overridden and become
@@ -166,12 +175,10 @@ void __init native_init_IRQ(void)
166 */ 175 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 176 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i; 177 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR) 178 if (!test_bit(vector, used_vectors))
170 set_intr_gate(vector, interrupt[i]); 179 set_intr_gate(vector, interrupt[i]);
171 } 180 }
172 181
173 apic_intr_init();
174
175 if (!acpi_ioapic) 182 if (!acpi_ioapic)
176 setup_irq(2, &irq2); 183 setup_irq(2, &irq2);
177} 184}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..0a813b17b172 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..c3ebbb901379 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,5 +332,6 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open
335 .long sys_preadv 336 .long sys_preadv
336 .long sys_pwritev 337 .long sys_pwritev
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..2cc162e09c4b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -945,8 +945,13 @@ void __init trap_init(void)
945#endif 945#endif
946 set_intr_gate(19, &simd_coprocessor_error); 946 set_intr_gate(19, &simd_coprocessor_error);
947 947
948 /* Reserve all the builtin and the syscall vector: */
949 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
950 set_bit(i, used_vectors);
951
948#ifdef CONFIG_IA32_EMULATION 952#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 953 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
954 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 955#endif
951 956
952#ifdef CONFIG_X86_32 957#ifdef CONFIG_X86_32
@@ -963,17 +968,9 @@ void __init trap_init(void)
963 } 968 }
964 969
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 970 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
966#endif
967
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972#ifdef CONFIG_X86_64
973 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
974#else
975 set_bit(SYSCALL_VECTOR, used_vectors); 971 set_bit(SYSCALL_VECTOR, used_vectors);
976#endif 972#endif
973
977 /* 974 /*
978 * Should be a barrier for any external CPU state: 975 * Should be a barrier for any external CPU state:
979 */ 976 */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..f2d3324d9215 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
27#include <linux/tty.h> 27#include <linux/tty.h>
28#include <linux/smp.h> 28#include <linux/smp.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/perf_counter.h>
30 31
31#include <asm-generic/sections.h> 32#include <asm-generic/sections.h>
32 33
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1045 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1046 pgtable_bad(regs, error_code, address);
1046 1047
1048 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
1049
1047 /* 1050 /*
1048 * If we're in an interrupt, have no user context or are running 1051 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1052 * in an atomic region then we must not take the fault:
@@ -1137,10 +1140,13 @@ good_area:
1137 return; 1140 return;
1138 } 1141 }
1139 1142
1140 if (fault & VM_FAULT_MAJOR) 1143 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1144 tsk->maj_flt++;
1142 else 1145 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
1146 } else {
1143 tsk->min_flt++; 1147 tsk->min_flt++;
1148 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
1149 }
1144 1150
1145 check_v8086_mode(regs, address, tsk); 1151 check_v8086_mode(regs, address, tsk);
1146 1152
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..c638685136e1 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 4e6e758bd397..429be896a030 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -757,8 +757,11 @@ static int acpi_idle_bm_check(void)
757 */ 757 */
758static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 758static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
759{ 759{
760 u64 perf_flags;
761
760 /* Don't trace irqs off for idle */ 762 /* Don't trace irqs off for idle */
761 stop_critical_timings(); 763 stop_critical_timings();
764 perf_flags = hw_perf_save_disable();
762 if (cx->entry_method == ACPI_CSTATE_FFH) { 765 if (cx->entry_method == ACPI_CSTATE_FFH) {
763 /* Call into architectural FFH based C-state */ 766 /* Call into architectural FFH based C-state */
764 acpi_processor_ffh_cstate_enter(cx); 767 acpi_processor_ffh_cstate_enter(cx);
@@ -773,6 +776,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
773 gets asserted in time to freeze execution properly. */ 776 gets asserted in time to freeze execution properly. */
774 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 777 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
775 } 778 }
779 hw_perf_restore(perf_flags);
776 start_critical_timings(); 780 start_critical_timings();
777} 781}
778 782
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 6de020d078e1..0540d5de2c17 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
244 struct pt_regs *regs = get_irq_regs(); 245 struct pt_regs *regs = get_irq_regs();
245 if (regs) 246 if (regs)
246 show_regs(regs); 247 show_regs(regs);
248 perf_counter_print_debug();
247} 249}
248static struct sysrq_key_op sysrq_showregs_op = { 250static struct sysrq_key_op sysrq_showregs_op = {
249 .handler = sysrq_handle_showregs, 251 .handler = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index 052a961e41aa..e015c0b5a082 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -1018,6 +1019,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1018 1019
1019 current->personality &= ~bprm->per_clear; 1020 current->personality &= ~bprm->per_clear;
1020 1021
1022 /*
1023 * Flush performance counters when crossing a
1024 * security domain:
1025 */
1026 if (!get_dumpable(current->mm))
1027 perf_counter_exit_task(current);
1028
1021 /* An exec changes our domain. We are no longer part of the thread 1029 /* An exec changes our domain. We are no longer part of the thread
1022 group */ 1030 group */
1023 1031
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index af1de95e711e..ca226a91abee 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -120,6 +120,18 @@ extern struct group_info init_groups;
120 120
121extern struct cred init_cred; 121extern struct cred init_cred;
122 122
123#ifdef CONFIG_PERF_COUNTERS
124# define INIT_PERF_COUNTERS(tsk) \
125 .perf_counter_ctx.counter_list = \
126 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
127 .perf_counter_ctx.event_list = \
128 LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \
129 .perf_counter_ctx.lock = \
130 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
131#else
132# define INIT_PERF_COUNTERS(tsk)
133#endif
134
123/* 135/*
124 * INIT_TASK is used to set up the first task table, touch at 136 * INIT_TASK is used to set up the first task table, touch at
125 * your own risk!. Base=0, limit=0x1fffff (=2MB) 137 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -185,6 +197,7 @@ extern struct cred init_cred;
185 INIT_IDS \ 197 INIT_IDS \
186 INIT_TRACE_IRQFLAGS \ 198 INIT_TRACE_IRQFLAGS \
187 INIT_LOCKDEP \ 199 INIT_LOCKDEP \
200 INIT_PERF_COUNTERS(tsk) \
188} 201}
189 202
190 203
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0c8b89f28a95..080d1fd461d7 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,13 @@ static inline unsigned int kstat_irqs(unsigned int irq)
81 return sum; 81 return sum;
82} 82}
83 83
84
85/*
86 * Lock/unlock the current runqueue - to extract task statistics:
87 */
88extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
84extern unsigned long long task_delta_exec(struct task_struct *); 89extern unsigned long long task_delta_exec(struct task_struct *);
90
85extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 91extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
86extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 92extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
87extern void account_steal_time(cputime_t); 93extern void account_steal_time(cputime_t);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7e0ab8..93054fc3635c 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
151extern int mutex_trylock(struct mutex *lock); 151extern int mutex_trylock(struct mutex *lock);
152extern void mutex_unlock(struct mutex *lock); 152extern void mutex_unlock(struct mutex *lock);
153 153
154/**
155 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
156 * @cnt: the atomic which we are to dec
157 * @lock: the mutex to return holding if we dec to 0
158 *
159 * return true and hold lock if we dec to 0, return false otherwise
160 */
161static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
162{
163 /* dec if we can't possibly hit 0 */
164 if (atomic_add_unless(cnt, -1, 1))
165 return 0;
166 /* we might hit 0, so take the lock */
167 mutex_lock(lock);
168 if (!atomic_dec_and_test(cnt)) {
169 /* when we actually did the dec, we didn't hit 0 */
170 mutex_unlock(lock);
171 return 0;
172 }
173 /* we hit 0, and we hold the lock */
174 return 1;
175}
176
154#endif 177#endif
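
This helper implements the "free on the last put, but serialize against concurrent gets" pattern; it is the counterpart of the atomic_inc_not_zero()/mutex sequence used in __hw_perf_counter_init() above, and hw_perf_counter_destroy() is its user. A minimal usage sketch with hypothetical names (release_hardware() is illustrative, not a kernel API):

	static atomic_t users = ATOMIC_INIT(0);
	static DEFINE_MUTEX(reserve_mutex);

	static void put_resource(void)
	{
		if (atomic_dec_and_mutex_lock(&users, &reserve_mutex)) {
			release_hardware();             /* hypothetical cleanup */
			mutex_unlock(&reserve_mutex);
		}
	}
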
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..7f5d353d78ac
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,591 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <linux/types.h>
17#include <linux/ioctl.h>
18#include <asm/byteorder.h>
19
20/*
21 * User-space ABI bits:
22 */
23
24/*
25 * hw_event.type
26 */
27enum perf_event_types {
28 PERF_TYPE_HARDWARE = 0,
29 PERF_TYPE_SOFTWARE = 1,
30 PERF_TYPE_TRACEPOINT = 2,
31
32 /*
33 * available TYPE space, raw is the max value.
34 */
35
36 PERF_TYPE_RAW = 128,
37};
38
39/*
40 * Generalized performance counter event types, used by the hw_event.event_id
41 * parameter of the sys_perf_counter_open() syscall:
42 */
43enum hw_event_ids {
44 /*
45 * Common hardware events, generalized by the kernel:
46 */
47 PERF_COUNT_CPU_CYCLES = 0,
48 PERF_COUNT_INSTRUCTIONS = 1,
49 PERF_COUNT_CACHE_REFERENCES = 2,
50 PERF_COUNT_CACHE_MISSES = 3,
51 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
52 PERF_COUNT_BRANCH_MISSES = 5,
53 PERF_COUNT_BUS_CYCLES = 6,
54
55 PERF_HW_EVENTS_MAX = 7,
56};
57
58/*
59 * Special "software" counters provided by the kernel, even if the hardware
60 * does not support performance counters. These counters measure various
61 * physical and sw events of the kernel (and allow the profiling of them as
62 * well):
63 */
64enum sw_event_ids {
65 PERF_COUNT_CPU_CLOCK = 0,
66 PERF_COUNT_TASK_CLOCK = 1,
67 PERF_COUNT_PAGE_FAULTS = 2,
68 PERF_COUNT_CONTEXT_SWITCHES = 3,
69 PERF_COUNT_CPU_MIGRATIONS = 4,
70 PERF_COUNT_PAGE_FAULTS_MIN = 5,
71 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
72
73 PERF_SW_EVENTS_MAX = 7,
74};
75
76#define __PERF_COUNTER_MASK(name) \
77 (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
78 PERF_COUNTER_##name##_SHIFT)
79
80#define PERF_COUNTER_RAW_BITS 1
81#define PERF_COUNTER_RAW_SHIFT 63
82#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW)
83
84#define PERF_COUNTER_CONFIG_BITS 63
85#define PERF_COUNTER_CONFIG_SHIFT 0
86#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG)
87
88#define PERF_COUNTER_TYPE_BITS 7
89#define PERF_COUNTER_TYPE_SHIFT 56
90#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE)
91
92#define PERF_COUNTER_EVENT_BITS 56
93#define PERF_COUNTER_EVENT_SHIFT 0
94#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
95
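These fields pack the whole event selection into the single config word: bit 63 flags a raw (CPU-specific) value, otherwise bits 56-62 carry the type and bits 0-55 the event id. A sketch of encoding both forms (the variable names are illustrative):

	/* generalized software event: type in bits 56-62, id in bits 0-55 */
	__u64 config_sw  = ((__u64)PERF_TYPE_SOFTWARE << PERF_COUNTER_TYPE_SHIFT) |
			   PERF_COUNT_PAGE_FAULTS;

	/* raw, CPU-specific event code: bit 63 set, hardware encoding below it */
	__u64 config_raw = PERF_COUNTER_RAW_MASK | 0x412e;
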
96/*
97 * Bits that can be set in hw_event.record_type to request information
98 * in the overflow packets.
99 */
100enum perf_counter_record_format {
101 PERF_RECORD_IP = 1U << 0,
102 PERF_RECORD_TID = 1U << 1,
103 PERF_RECORD_GROUP = 1U << 2,
104 PERF_RECORD_CALLCHAIN = 1U << 3,
105 PERF_RECORD_TIME = 1U << 4,
106};
107
108/*
109 * Bits that can be set in hw_event.read_format to request that
110 * reads on the counter should return the indicated quantities,
111 * in increasing order of bit value, after the counter value.
112 */
113enum perf_counter_read_format {
114 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
115 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
116};
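
Taken together with the read path added later in this series, a read() on the counter fd returns the counter value first, followed by the requested quantities in increasing bit order. As an illustrative sketch only (the struct name is not part of the ABI):

struct perf_read_layout {			/* illustrative name only */
	__u64	value;				/* always present */
	__u64	time_enabled;			/* if PERF_FORMAT_TOTAL_TIME_ENABLED */
	__u64	time_running;			/* if PERF_FORMAT_TOTAL_TIME_RUNNING */
};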
117
118/*
119 * Hardware event to monitor via a performance monitoring counter:
120 */
121struct perf_counter_hw_event {
122 /*
 123 * The MSB of the config word signifies whether the rest contains
 124 * CPU-specific (raw) counter configuration data; if it is unset, the
 125 * next 7 bits are an event type and the remaining bits are the
 126 * event identifier.
127 */
128 __u64 config;
129
130 __u64 irq_period;
131 __u32 record_type;
132 __u32 read_format;
133
134 __u64 disabled : 1, /* off by default */
135 nmi : 1, /* NMI sampling */
136 inherit : 1, /* children inherit it */
137 pinned : 1, /* must always be on PMU */
138 exclusive : 1, /* only group on PMU */
139 exclude_user : 1, /* don't count user */
140 exclude_kernel : 1, /* ditto kernel */
141 exclude_hv : 1, /* ditto hypervisor */
142 exclude_idle : 1, /* don't count when idle */
143 mmap : 1, /* include mmap data */
144 munmap : 1, /* include munmap data */
145
146 __reserved_1 : 53;
147
148 __u32 extra_config_len;
149 __u32 wakeup_events; /* wakeup every n events */
150
151 __u64 __reserved_2;
152 __u64 __reserved_3;
153};
154
155/*
156 * Ioctls that can be done on a perf counter fd:
157 */
158#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0)
159#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1)
160#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32)
161
162/*
163 * Structure of the page that can be mapped via mmap
164 */
165struct perf_counter_mmap_page {
166 __u32 version; /* version number of this structure */
167 __u32 compat_version; /* lowest version this is compat with */
168
169 /*
170 * Bits needed to read the hw counters in user-space.
171 *
172 * u32 seq;
173 * s64 count;
174 *
175 * do {
176 * seq = pc->lock;
177 *
178 * barrier()
179 * if (pc->index) {
180 * count = pmc_read(pc->index - 1);
181 * count += pc->offset;
182 * } else
183 * goto regular_read;
184 *
185 * barrier();
186 * } while (pc->lock != seq);
187 *
 188 * NOTE: for obvious reasons this only works on self-monitoring
189 * processes.
190 */
191 __u32 lock; /* seqlock for synchronization */
192 __u32 index; /* hardware counter identifier */
193 __s64 offset; /* add to hardware counter value */
194
195 /*
196 * Control data for the mmap() data buffer.
197 *
198 * User-space reading this value should issue an rmb(), on SMP capable
199 * platforms, after reading this value -- see perf_counter_wakeup().
200 */
201 __u32 data_head; /* head in the data section */
202};
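
The read sequence sketched in the comment above can be written out as follows. This is a hedged sketch: pmc_read() stands in for an architecture-specific user-space counter read (for instance RDPMC on x86) and barrier() for a compiler barrier; neither is provided by this header.

static __u64 mmap_read_self(volatile struct perf_counter_mmap_page *pc)
{
	__u32 seq;
	__s64 count;

	do {
		seq = pc->lock;
		barrier();

		if (!pc->index)
			return 0;	/* no hw counter mapped: fall back to read() on the fd */
		count = pmc_read(pc->index - 1) + pc->offset;

		barrier();
	} while (pc->lock != seq);

	return count;
}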
203
204struct perf_event_header {
205 __u32 type;
206 __u32 size;
207};
208
209enum perf_event_type {
210
211 /*
212 * The MMAP events record the PROT_EXEC mappings so that we can
213 * correlate userspace IPs to code. They have the following structure:
214 *
215 * struct {
216 * struct perf_event_header header;
217 *
218 * u32 pid, tid;
219 * u64 addr;
220 * u64 len;
221 * u64 pgoff;
222 * char filename[];
223 * };
224 */
225 PERF_EVENT_MMAP = 1,
226 PERF_EVENT_MUNMAP = 2,
227
228 /*
229 * Half the event type space is reserved for the counter overflow
230 * bitfields, as found in hw_event.record_type.
231 *
232 * These events will have types of the form:
233 * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
234 *
235 * struct {
236 * struct perf_event_header header;
237 *
238 * { u64 ip; } && __PERF_EVENT_IP
239 * { u32 pid, tid; } && __PERF_EVENT_TID
240 *
241 * { u64 nr;
242 * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP
243 *
244 * { u16 nr,
245 * hv,
246 * kernel,
247 * user;
248 * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN
249 *
250 * { u64 time; } && __PERF_EVENT_TIME
251 * };
252 */
253 PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31,
254 __PERF_EVENT_IP = PERF_RECORD_IP,
255 __PERF_EVENT_TID = PERF_RECORD_TID,
256 __PERF_EVENT_GROUP = PERF_RECORD_GROUP,
257 __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN,
258 __PERF_EVENT_TIME = PERF_RECORD_TIME,
259};
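
Written out as a C struct, the PERF_EVENT_MMAP/MUNMAP record described in the comment above looks like this (the struct name is illustrative only; filename[] is the NUL-terminated path, padded out to header.size):

struct perf_event_mmap_record {		/* illustrative name only */
	struct perf_event_header	header;
	__u32				pid, tid;
	__u64				addr;
	__u64				len;
	__u64				pgoff;
	char				filename[];
};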
260
261#ifdef __KERNEL__
262/*
263 * Kernel-internal data types and definitions:
264 */
265
266#ifdef CONFIG_PERF_COUNTERS
267# include <asm/perf_counter.h>
268#endif
269
270#include <linux/list.h>
271#include <linux/mutex.h>
272#include <linux/rculist.h>
273#include <linux/rcupdate.h>
274#include <linux/spinlock.h>
275#include <linux/hrtimer.h>
276#include <linux/fs.h>
277#include <asm/atomic.h>
278
279struct task_struct;
280
281static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
282{
283 return hw_event->config & PERF_COUNTER_RAW_MASK;
284}
285
286static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
287{
288 return hw_event->config & PERF_COUNTER_CONFIG_MASK;
289}
290
291static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
292{
293 return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
294 PERF_COUNTER_TYPE_SHIFT;
295}
296
297static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
298{
299 return hw_event->config & PERF_COUNTER_EVENT_MASK;
300}
301
302/**
303 * struct hw_perf_counter - performance counter hardware details:
304 */
305struct hw_perf_counter {
306#ifdef CONFIG_PERF_COUNTERS
307 union {
308 struct { /* hardware */
309 u64 config;
310 unsigned long config_base;
311 unsigned long counter_base;
312 int nmi;
313 unsigned int idx;
314 };
315 union { /* software */
316 atomic64_t count;
317 struct hrtimer hrtimer;
318 };
319 };
320 atomic64_t prev_count;
321 u64 irq_period;
322 atomic64_t period_left;
323#endif
324};
325
326struct perf_counter;
327
328/**
329 * struct hw_perf_counter_ops - performance counter hw ops
330 */
331struct hw_perf_counter_ops {
332 int (*enable) (struct perf_counter *counter);
333 void (*disable) (struct perf_counter *counter);
334 void (*read) (struct perf_counter *counter);
335};
336
337/**
338 * enum perf_counter_active_state - the states of a counter
339 */
340enum perf_counter_active_state {
341 PERF_COUNTER_STATE_ERROR = -2,
342 PERF_COUNTER_STATE_OFF = -1,
343 PERF_COUNTER_STATE_INACTIVE = 0,
344 PERF_COUNTER_STATE_ACTIVE = 1,
345};
346
347struct file;
348
349struct perf_mmap_data {
350 struct rcu_head rcu_head;
351 int nr_pages;
352 atomic_t wakeup;
353 atomic_t head;
354 atomic_t events;
355 struct perf_counter_mmap_page *user_page;
356 void *data_pages[0];
357};
358
359struct perf_pending_entry {
360 struct perf_pending_entry *next;
361 void (*func)(struct perf_pending_entry *);
362};
363
364/**
365 * struct perf_counter - performance counter kernel representation:
366 */
367struct perf_counter {
368#ifdef CONFIG_PERF_COUNTERS
369 struct list_head list_entry;
370 struct list_head event_entry;
371 struct list_head sibling_list;
372 int nr_siblings;
373 struct perf_counter *group_leader;
374 const struct hw_perf_counter_ops *hw_ops;
375
376 enum perf_counter_active_state state;
377 enum perf_counter_active_state prev_state;
378 atomic64_t count;
379
380 /*
381 * These are the total time in nanoseconds that the counter
382 * has been enabled (i.e. eligible to run, and the task has
383 * been scheduled in, if this is a per-task counter)
384 * and running (scheduled onto the CPU), respectively.
385 *
386 * They are computed from tstamp_enabled, tstamp_running and
387 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
388 */
389 u64 total_time_enabled;
390 u64 total_time_running;
391
392 /*
393 * These are timestamps used for computing total_time_enabled
394 * and total_time_running when the counter is in INACTIVE or
395 * ACTIVE state, measured in nanoseconds from an arbitrary point
396 * in time.
397 * tstamp_enabled: the notional time when the counter was enabled
398 * tstamp_running: the notional time when the counter was scheduled on
399 * tstamp_stopped: in INACTIVE state, the notional time when the
400 * counter was scheduled off.
401 */
402 u64 tstamp_enabled;
403 u64 tstamp_running;
404 u64 tstamp_stopped;
405
406 struct perf_counter_hw_event hw_event;
407 struct hw_perf_counter hw;
408
409 struct perf_counter_context *ctx;
410 struct task_struct *task;
411 struct file *filp;
412
413 struct perf_counter *parent;
414 struct list_head child_list;
415
416 /*
417 * These accumulate total time (in nanoseconds) that children
418 * counters have been enabled and running, respectively.
419 */
420 atomic64_t child_total_time_enabled;
421 atomic64_t child_total_time_running;
422
423 /*
424 * Protect attach/detach and child_list:
425 */
426 struct mutex mutex;
427
428 int oncpu;
429 int cpu;
430
431 /* mmap bits */
432 struct mutex mmap_mutex;
433 atomic_t mmap_count;
434 struct perf_mmap_data *data;
435
436 /* poll related */
437 wait_queue_head_t waitq;
438 struct fasync_struct *fasync;
439
440 /* delayed work for NMIs and such */
441 int pending_wakeup;
442 int pending_kill;
443 int pending_disable;
444 struct perf_pending_entry pending;
445
446 atomic_t event_limit;
447
448 void (*destroy)(struct perf_counter *);
449 struct rcu_head rcu_head;
450#endif
451};
452
453/**
454 * struct perf_counter_context - counter context structure
455 *
456 * Used as a container for task counters and CPU counters as well:
457 */
458struct perf_counter_context {
459#ifdef CONFIG_PERF_COUNTERS
460 /*
461 * Protect the states of the counters in the list,
462 * nr_active, and the list:
463 */
464 spinlock_t lock;
465 /*
466 * Protect the list of counters. Locking either mutex or lock
467 * is sufficient to ensure the list doesn't change; to change
468 * the list you need to lock both the mutex and the spinlock.
469 */
470 struct mutex mutex;
471
472 struct list_head counter_list;
473 struct list_head event_list;
474 int nr_counters;
475 int nr_active;
476 int is_active;
477 struct task_struct *task;
478
479 /*
480 * Context clock, runs when context enabled.
481 */
482 u64 time;
483 u64 timestamp;
484#endif
485};
486
487/**
488 * struct perf_counter_cpu_context - per cpu counter context structure
489 */
490struct perf_cpu_context {
491 struct perf_counter_context ctx;
492 struct perf_counter_context *task_ctx;
493 int active_oncpu;
494 int max_pertask;
495 int exclusive;
496
497 /*
498 * Recursion avoidance:
499 *
500 * task, softirq, irq, nmi context
501 */
502 int recursion[4];
503};
504
505/*
506 * Set by architecture code:
507 */
508extern int perf_max_counters;
509
510#ifdef CONFIG_PERF_COUNTERS
511extern const struct hw_perf_counter_ops *
512hw_perf_counter_init(struct perf_counter *counter);
513
514extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
515extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
516extern void perf_counter_task_tick(struct task_struct *task, int cpu);
517extern void perf_counter_init_task(struct task_struct *child);
518extern void perf_counter_exit_task(struct task_struct *child);
519extern void perf_counter_do_pending(void);
520extern void perf_counter_print_debug(void);
521extern void perf_counter_unthrottle(void);
522extern u64 hw_perf_save_disable(void);
523extern void hw_perf_restore(u64 ctrl);
524extern int perf_counter_task_disable(void);
525extern int perf_counter_task_enable(void);
526extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
527 struct perf_cpu_context *cpuctx,
528 struct perf_counter_context *ctx, int cpu);
529extern void perf_counter_update_userpage(struct perf_counter *counter);
530
531extern int perf_counter_overflow(struct perf_counter *counter,
532 int nmi, struct pt_regs *regs);
533/*
534 * Return 1 for a software counter, 0 for a hardware counter
535 */
536static inline int is_software_counter(struct perf_counter *counter)
537{
538 return !perf_event_raw(&counter->hw_event) &&
539 perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
540}
541
542extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
543
544extern void perf_counter_mmap(unsigned long addr, unsigned long len,
545 unsigned long pgoff, struct file *file);
546
547extern void perf_counter_munmap(unsigned long addr, unsigned long len,
548 unsigned long pgoff, struct file *file);
549
550#define MAX_STACK_DEPTH 255
551
552struct perf_callchain_entry {
553 u16 nr, hv, kernel, user;
554 u64 ip[MAX_STACK_DEPTH];
555};
556
557extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
558
559#else
560static inline void
561perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
562static inline void
563perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
564static inline void
565perf_counter_task_tick(struct task_struct *task, int cpu) { }
566static inline void perf_counter_init_task(struct task_struct *child) { }
567static inline void perf_counter_exit_task(struct task_struct *child) { }
568static inline void perf_counter_do_pending(void) { }
569static inline void perf_counter_print_debug(void) { }
570static inline void perf_counter_unthrottle(void) { }
571static inline void hw_perf_restore(u64 ctrl) { }
572static inline u64 hw_perf_save_disable(void) { return 0; }
573static inline int perf_counter_task_disable(void) { return -EINVAL; }
574static inline int perf_counter_task_enable(void) { return -EINVAL; }
575
576static inline void
577perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { }
578
579
580static inline void
581perf_counter_mmap(unsigned long addr, unsigned long len,
582 unsigned long pgoff, struct file *file) { }
583
584static inline void
585perf_counter_munmap(unsigned long addr, unsigned long len,
586 unsigned long pgoff, struct file *file) { }
587
588#endif
589
590#endif /* __KERNEL__ */
591#endif /* _LINUX_PERF_COUNTER_H */
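
To illustrate how the ABI above fits together, here is a minimal user-space sketch that counts the instructions executed by a workload in the current task. __NR_perf_counter_open is architecture specific and assumed to be wired up as in the arch parts of this series; error handling is kept minimal.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_hw_event hw_event;
	unsigned long long count = 0;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	/* type goes into bits 56-62, the event id into bits 0-55 */
	hw_event.config = ((__u64)PERF_TYPE_HARDWARE << PERF_COUNTER_TYPE_SHIFT) |
			  PERF_COUNT_INSTRUCTIONS;
	hw_event.disabled = 1;			/* enabled explicitly below */

	/* pid 0: current task, cpu -1: any cpu, no group leader, no flags */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
	/* ... run the workload to be measured ... */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %llu\n", count);

	close(fd);
	return 0;
}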
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b94f3541f67b..7ed41f7c5ace 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/path.h> 71#include <linux/path.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -137,6 +138,7 @@ extern unsigned long nr_running(void);
137extern unsigned long nr_uninterruptible(void); 138extern unsigned long nr_uninterruptible(void);
138extern unsigned long nr_active(void); 139extern unsigned long nr_active(void);
139extern unsigned long nr_iowait(void); 140extern unsigned long nr_iowait(void);
141extern u64 cpu_nr_migrations(int cpu);
140 142
141extern unsigned long get_parent_ip(unsigned long addr); 143extern unsigned long get_parent_ip(unsigned long addr);
142 144
@@ -1048,9 +1050,10 @@ struct sched_entity {
1048 u64 last_wakeup; 1050 u64 last_wakeup;
1049 u64 avg_overlap; 1051 u64 avg_overlap;
1050 1052
1053 u64 nr_migrations;
1054
1051 u64 start_runtime; 1055 u64 start_runtime;
1052 u64 avg_wakeup; 1056 u64 avg_wakeup;
1053 u64 nr_migrations;
1054 1057
1055#ifdef CONFIG_SCHEDSTATS 1058#ifdef CONFIG_SCHEDSTATS
1056 u64 wait_start; 1059 u64 wait_start;
@@ -1372,6 +1375,7 @@ struct task_struct {
1372 struct list_head pi_state_list; 1375 struct list_head pi_state_list;
1373 struct futex_pi_state *pi_state_cache; 1376 struct futex_pi_state *pi_state_cache;
1374#endif 1377#endif
1378 struct perf_counter_context perf_counter_ctx;
1375#ifdef CONFIG_NUMA 1379#ifdef CONFIG_NUMA
1376 struct mempolicy *mempolicy; 1380 struct mempolicy *mempolicy;
1377 short il_next; 1381 short il_next;
@@ -2380,6 +2384,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2380#define TASK_SIZE_OF(tsk) TASK_SIZE 2384#define TASK_SIZE_OF(tsk) TASK_SIZE
2381#endif 2385#endif
2382 2386
2387/*
2388 * Call the function if the target task is executing on a CPU right now:
2389 */
2390extern void task_oncpu_function_call(struct task_struct *p,
2391 void (*func) (void *info), void *info);
2392
2393
2383#ifdef CONFIG_MM_OWNER 2394#ifdef CONFIG_MM_OWNER
2384extern void mm_update_next_owner(struct mm_struct *mm); 2395extern void mm_update_next_owner(struct mm_struct *mm);
2385extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2396extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 6470f74074af..471143bf2aae 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct old_linux_dirent; 57struct old_linux_dirent;
58struct perf_counter_hw_event;
58 59
59#include <linux/types.h> 60#include <linux/types.h>
60#include <linux/aio_abi.h> 61#include <linux/aio_abi.h>
@@ -754,4 +755,8 @@ asmlinkage long sys_pipe(int __user *);
754 755
755int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 756int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
756 757
758
759asmlinkage long sys_perf_counter_open(
760 const struct perf_counter_hw_event __user *hw_event_uptr,
761 pid_t pid, int cpu, int group_fd, unsigned long flags);
757#endif 762#endif
diff --git a/init/Kconfig b/init/Kconfig
index c52d1d48272a..35659ed442e5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -919,6 +919,41 @@ config AIO
919 by some high performance threaded applications. Disabling 919 by some high performance threaded applications. Disabling
920 this option saves about 7k. 920 this option saves about 7k.
921 921
922config HAVE_PERF_COUNTERS
923 bool
924
925menu "Performance Counters"
926
927config PERF_COUNTERS
928 bool "Kernel Performance Counters"
929 depends on HAVE_PERF_COUNTERS
930 default y
931 select ANON_INODES
932 help
933 Enable kernel support for performance counter hardware.
934
935 Performance counters are special hardware registers available
936 on most modern CPUs. These registers count the number of certain
 937 types of hw events, such as instructions executed, cache misses
938 suffered, or branches mis-predicted - without slowing down the
939 kernel or applications. These registers can also trigger interrupts
940 when a threshold number of events have passed - and can thus be
941 used to profile the code that runs on that CPU.
942
943 The Linux Performance Counter subsystem provides an abstraction of
944 these hardware capabilities, available via a system call. It
945 provides per task and per CPU counters, and it provides event
946 capabilities on top of those.
947
948 Say Y if unsure.
949
950config EVENT_PROFILE
951 bool "Tracepoint profile sources"
952 depends on PERF_COUNTERS && EVENT_TRACER
953 default y
954
955endmenu
956
922config VM_EVENT_COUNTERS 957config VM_EVENT_COUNTERS
923 default y 958 default y
924 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 959 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dffe37e9..63c697529ca1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 94obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_SLOW_WORK) += slow-work.o 96obj-$(CONFIG_SLOW_WORK) += slow-work.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
97 98
98ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
99# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index 32cbf2607cb0..fbb5d94c8bbc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -158,6 +158,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 158{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 160
161#ifdef CONFIG_PERF_COUNTERS
162 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
163#endif
161 trace_sched_process_free(tsk); 164 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 165 put_task_struct(tsk);
163} 166}
@@ -979,10 +982,6 @@ NORET_TYPE void do_exit(long code)
979 tsk->mempolicy = NULL; 982 tsk->mempolicy = NULL;
980#endif 983#endif
981#ifdef CONFIG_FUTEX 984#ifdef CONFIG_FUTEX
982 /*
983 * This must happen late, after the PID is not
984 * hashed anymore:
985 */
986 if (unlikely(!list_empty(&tsk->pi_state_list))) 985 if (unlikely(!list_empty(&tsk->pi_state_list)))
987 exit_pi_state_list(tsk); 986 exit_pi_state_list(tsk);
988 if (unlikely(current->pi_state_cache)) 987 if (unlikely(current->pi_state_cache))
@@ -1249,6 +1248,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1249 */ 1248 */
1250 read_unlock(&tasklist_lock); 1249 read_unlock(&tasklist_lock);
1251 1250
1251 /*
1252 * Flush inherited counters to the parent - before the parent
1253 * gets woken up by child-exit notifications.
1254 */
1255 perf_counter_exit_task(p);
1256
1252 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1257 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1253 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1258 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1254 ? p->signal->group_exit_code : p->exit_code; 1259 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8765bc..381d7f9b70fb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
975 goto fork_out; 975 goto fork_out;
976 976
977 rt_mutex_init_task(p); 977 rt_mutex_init_task(p);
978 perf_counter_init_task(p);
978 979
979#ifdef CONFIG_PROVE_LOCKING 980#ifdef CONFIG_PROVE_LOCKING
980 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 981 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a3..fd95eaa672e6 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..863703b3158f
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,3150 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 *
8 * For licensing details see kernel-base/COPYING
9 */
10
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/cpu.h>
14#include <linux/smp.h>
15#include <linux/file.h>
16#include <linux/poll.h>
17#include <linux/sysfs.h>
18#include <linux/ptrace.h>
19#include <linux/percpu.h>
20#include <linux/vmstat.h>
21#include <linux/hardirq.h>
22#include <linux/rculist.h>
23#include <linux/uaccess.h>
24#include <linux/syscalls.h>
25#include <linux/anon_inodes.h>
26#include <linux/kernel_stat.h>
27#include <linux/perf_counter.h>
28#include <linux/dcache.h>
29
30#include <asm/irq_regs.h>
31
32/*
33 * Each CPU has a list of per CPU counters:
34 */
35DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
36
37int perf_max_counters __read_mostly = 1;
38static int perf_reserved_percpu __read_mostly;
39static int perf_overcommit __read_mostly = 1;
40
41/*
42 * Mutex for (sysadmin-configurable) counter reservations:
43 */
44static DEFINE_MUTEX(perf_resource_mutex);
45
46/*
47 * Architecture provided APIs - weak aliases:
48 */
49extern __weak const struct hw_perf_counter_ops *
50hw_perf_counter_init(struct perf_counter *counter)
51{
52 return NULL;
53}
54
55u64 __weak hw_perf_save_disable(void) { return 0; }
56void __weak hw_perf_restore(u64 ctrl) { barrier(); }
57void __weak hw_perf_counter_setup(int cpu) { barrier(); }
58int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
59 struct perf_cpu_context *cpuctx,
60 struct perf_counter_context *ctx, int cpu)
61{
62 return 0;
63}
64
65void __weak perf_counter_print_debug(void) { }
66
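
These __weak definitions are the hooks an architecture backend overrides; the real implementations in this series live under arch/powerpc and arch/x86. A minimal, hedged sketch of the shape such a backend takes (the arch_pmu_* names are placeholders, not functions from this patch):

static int arch_pmu_enable(struct perf_counter *counter)
{
	/* program a free hardware counter for counter->hw.config */
	return 0;
}

static void arch_pmu_disable(struct perf_counter *counter)
{
	/* stop the hardware counter and fold its value into counter->count */
}

static void arch_pmu_read(struct perf_counter *counter)
{
	/* refresh counter->count from the hardware counter */
}

static const struct hw_perf_counter_ops arch_pmu_ops = {
	.enable		= arch_pmu_enable,
	.disable	= arch_pmu_disable,
	.read		= arch_pmu_read,
};

const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	/* reject event configurations this PMU cannot count; otherwise: */
	return &arch_pmu_ops;
}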
67static void
68list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
69{
70 struct perf_counter *group_leader = counter->group_leader;
71
72 /*
73 * Depending on whether it is a standalone or sibling counter,
74 * add it straight to the context's counter list, or to the group
75 * leader's sibling list:
76 */
77 if (counter->group_leader == counter)
78 list_add_tail(&counter->list_entry, &ctx->counter_list);
79 else {
80 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
81 group_leader->nr_siblings++;
82 }
83
84 list_add_rcu(&counter->event_entry, &ctx->event_list);
85}
86
87static void
88list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
89{
90 struct perf_counter *sibling, *tmp;
91
92 list_del_init(&counter->list_entry);
93 list_del_rcu(&counter->event_entry);
94
95 if (counter->group_leader != counter)
96 counter->group_leader->nr_siblings--;
97
98 /*
99 * If this was a group counter with sibling counters then
100 * upgrade the siblings to singleton counters by adding them
101 * to the context list directly:
102 */
103 list_for_each_entry_safe(sibling, tmp,
104 &counter->sibling_list, list_entry) {
105
106 list_move_tail(&sibling->list_entry, &ctx->counter_list);
107 sibling->group_leader = sibling;
108 }
109}
110
111static void
112counter_sched_out(struct perf_counter *counter,
113 struct perf_cpu_context *cpuctx,
114 struct perf_counter_context *ctx)
115{
116 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
117 return;
118
119 counter->state = PERF_COUNTER_STATE_INACTIVE;
120 counter->tstamp_stopped = ctx->time;
121 counter->hw_ops->disable(counter);
122 counter->oncpu = -1;
123
124 if (!is_software_counter(counter))
125 cpuctx->active_oncpu--;
126 ctx->nr_active--;
127 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
128 cpuctx->exclusive = 0;
129}
130
131static void
132group_sched_out(struct perf_counter *group_counter,
133 struct perf_cpu_context *cpuctx,
134 struct perf_counter_context *ctx)
135{
136 struct perf_counter *counter;
137
138 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
139 return;
140
141 counter_sched_out(group_counter, cpuctx, ctx);
142
143 /*
144 * Schedule out siblings (if any):
145 */
146 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
147 counter_sched_out(counter, cpuctx, ctx);
148
149 if (group_counter->hw_event.exclusive)
150 cpuctx->exclusive = 0;
151}
152
153/*
154 * Cross CPU call to remove a performance counter
155 *
156 * We disable the counter on the hardware level first. After that we
157 * remove it from the context list.
158 */
159static void __perf_counter_remove_from_context(void *info)
160{
161 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
162 struct perf_counter *counter = info;
163 struct perf_counter_context *ctx = counter->ctx;
164 unsigned long flags;
165 u64 perf_flags;
166
167 /*
168 * If this is a task context, we need to check whether it is
169 * the current task context of this cpu. If not it has been
170 * scheduled out before the smp call arrived.
171 */
172 if (ctx->task && cpuctx->task_ctx != ctx)
173 return;
174
175 spin_lock_irqsave(&ctx->lock, flags);
176
177 counter_sched_out(counter, cpuctx, ctx);
178
179 counter->task = NULL;
180 ctx->nr_counters--;
181
182 /*
183 * Protect the list operation against NMI by disabling the
184 * counters on a global level. NOP for non NMI based counters.
185 */
186 perf_flags = hw_perf_save_disable();
187 list_del_counter(counter, ctx);
188 hw_perf_restore(perf_flags);
189
190 if (!ctx->task) {
191 /*
192 * Allow more per task counters with respect to the
193 * reservation:
194 */
195 cpuctx->max_pertask =
196 min(perf_max_counters - ctx->nr_counters,
197 perf_max_counters - perf_reserved_percpu);
198 }
199
200 spin_unlock_irqrestore(&ctx->lock, flags);
201}
202
203
204/*
205 * Remove the counter from a task's (or a CPU's) list of counters.
206 *
207 * Must be called with counter->mutex and ctx->mutex held.
208 *
209 * CPU counters are removed with a smp call. For task counters we only
210 * call when the task is on a CPU.
211 */
212static void perf_counter_remove_from_context(struct perf_counter *counter)
213{
214 struct perf_counter_context *ctx = counter->ctx;
215 struct task_struct *task = ctx->task;
216
217 if (!task) {
218 /*
219 * Per cpu counters are removed via an smp call and
 220 * the removal is always successful.
221 */
222 smp_call_function_single(counter->cpu,
223 __perf_counter_remove_from_context,
224 counter, 1);
225 return;
226 }
227
228retry:
229 task_oncpu_function_call(task, __perf_counter_remove_from_context,
230 counter);
231
232 spin_lock_irq(&ctx->lock);
233 /*
234 * If the context is active we need to retry the smp call.
235 */
236 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
237 spin_unlock_irq(&ctx->lock);
238 goto retry;
239 }
240
241 /*
 242 * The lock prevents this context from being scheduled in, so we
 243 * can remove the counter safely if the call above did not
 244 * succeed.
245 */
246 if (!list_empty(&counter->list_entry)) {
247 ctx->nr_counters--;
248 list_del_counter(counter, ctx);
249 counter->task = NULL;
250 }
251 spin_unlock_irq(&ctx->lock);
252}
253
254static inline u64 perf_clock(void)
255{
256 return cpu_clock(smp_processor_id());
257}
258
259/*
260 * Update the record of the current time in a context.
261 */
262static void update_context_time(struct perf_counter_context *ctx)
263{
264 u64 now = perf_clock();
265
266 ctx->time += now - ctx->timestamp;
267 ctx->timestamp = now;
268}
269
270/*
271 * Update the total_time_enabled and total_time_running fields for a counter.
272 */
273static void update_counter_times(struct perf_counter *counter)
274{
275 struct perf_counter_context *ctx = counter->ctx;
276 u64 run_end;
277
278 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
279 return;
280
281 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
282
283 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
284 run_end = counter->tstamp_stopped;
285 else
286 run_end = ctx->time;
287
288 counter->total_time_running = run_end - counter->tstamp_running;
289}
290
291/*
292 * Update total_time_enabled and total_time_running for all counters in a group.
293 */
294static void update_group_times(struct perf_counter *leader)
295{
296 struct perf_counter *counter;
297
298 update_counter_times(leader);
299 list_for_each_entry(counter, &leader->sibling_list, list_entry)
300 update_counter_times(counter);
301}
302
303/*
304 * Cross CPU call to disable a performance counter
305 */
306static void __perf_counter_disable(void *info)
307{
308 struct perf_counter *counter = info;
309 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
310 struct perf_counter_context *ctx = counter->ctx;
311 unsigned long flags;
312
313 /*
314 * If this is a per-task counter, need to check whether this
315 * counter's task is the current task on this cpu.
316 */
317 if (ctx->task && cpuctx->task_ctx != ctx)
318 return;
319
320 spin_lock_irqsave(&ctx->lock, flags);
321
322 update_context_time(ctx);
323
324 /*
325 * If the counter is on, turn it off.
326 * If it is in error state, leave it in error state.
327 */
328 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
329 update_context_time(ctx);
330 update_counter_times(counter);
331 if (counter == counter->group_leader)
332 group_sched_out(counter, cpuctx, ctx);
333 else
334 counter_sched_out(counter, cpuctx, ctx);
335 counter->state = PERF_COUNTER_STATE_OFF;
336 }
337
338 spin_unlock_irqrestore(&ctx->lock, flags);
339}
340
341/*
342 * Disable a counter.
343 */
344static void perf_counter_disable(struct perf_counter *counter)
345{
346 struct perf_counter_context *ctx = counter->ctx;
347 struct task_struct *task = ctx->task;
348
349 if (!task) {
350 /*
351 * Disable the counter on the cpu that it's on
352 */
353 smp_call_function_single(counter->cpu, __perf_counter_disable,
354 counter, 1);
355 return;
356 }
357
358 retry:
359 task_oncpu_function_call(task, __perf_counter_disable, counter);
360
361 spin_lock_irq(&ctx->lock);
362 /*
363 * If the counter is still active, we need to retry the cross-call.
364 */
365 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
366 spin_unlock_irq(&ctx->lock);
367 goto retry;
368 }
369
370 /*
371 * Since we have the lock this context can't be scheduled
372 * in, so we can change the state safely.
373 */
374 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
375 update_counter_times(counter);
376 counter->state = PERF_COUNTER_STATE_OFF;
377 }
378
379 spin_unlock_irq(&ctx->lock);
380}
381
382/*
383 * Disable a counter and all its children.
384 */
385static void perf_counter_disable_family(struct perf_counter *counter)
386{
387 struct perf_counter *child;
388
389 perf_counter_disable(counter);
390
391 /*
392 * Lock the mutex to protect the list of children
393 */
394 mutex_lock(&counter->mutex);
395 list_for_each_entry(child, &counter->child_list, child_list)
396 perf_counter_disable(child);
397 mutex_unlock(&counter->mutex);
398}
399
400static int
401counter_sched_in(struct perf_counter *counter,
402 struct perf_cpu_context *cpuctx,
403 struct perf_counter_context *ctx,
404 int cpu)
405{
406 if (counter->state <= PERF_COUNTER_STATE_OFF)
407 return 0;
408
409 counter->state = PERF_COUNTER_STATE_ACTIVE;
410 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
411 /*
412 * The new state must be visible before we turn it on in the hardware:
413 */
414 smp_wmb();
415
416 if (counter->hw_ops->enable(counter)) {
417 counter->state = PERF_COUNTER_STATE_INACTIVE;
418 counter->oncpu = -1;
419 return -EAGAIN;
420 }
421
422 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
423
424 if (!is_software_counter(counter))
425 cpuctx->active_oncpu++;
426 ctx->nr_active++;
427
428 if (counter->hw_event.exclusive)
429 cpuctx->exclusive = 1;
430
431 return 0;
432}
433
434/*
435 * Return 1 for a group consisting entirely of software counters,
436 * 0 if the group contains any hardware counters.
437 */
438static int is_software_only_group(struct perf_counter *leader)
439{
440 struct perf_counter *counter;
441
442 if (!is_software_counter(leader))
443 return 0;
444
445 list_for_each_entry(counter, &leader->sibling_list, list_entry)
446 if (!is_software_counter(counter))
447 return 0;
448
449 return 1;
450}
451
452/*
453 * Work out whether we can put this counter group on the CPU now.
454 */
455static int group_can_go_on(struct perf_counter *counter,
456 struct perf_cpu_context *cpuctx,
457 int can_add_hw)
458{
459 /*
460 * Groups consisting entirely of software counters can always go on.
461 */
462 if (is_software_only_group(counter))
463 return 1;
464 /*
465 * If an exclusive group is already on, no other hardware
466 * counters can go on.
467 */
468 if (cpuctx->exclusive)
469 return 0;
470 /*
471 * If this group is exclusive and there are already
472 * counters on the CPU, it can't go on.
473 */
474 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
475 return 0;
476 /*
477 * Otherwise, try to add it if all previous groups were able
478 * to go on.
479 */
480 return can_add_hw;
481}
482
483static void add_counter_to_ctx(struct perf_counter *counter,
484 struct perf_counter_context *ctx)
485{
486 list_add_counter(counter, ctx);
487 ctx->nr_counters++;
488 counter->prev_state = PERF_COUNTER_STATE_OFF;
489 counter->tstamp_enabled = ctx->time;
490 counter->tstamp_running = ctx->time;
491 counter->tstamp_stopped = ctx->time;
492}
493
494/*
495 * Cross CPU call to install and enable a performance counter
496 */
497static void __perf_install_in_context(void *info)
498{
499 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
500 struct perf_counter *counter = info;
501 struct perf_counter_context *ctx = counter->ctx;
502 struct perf_counter *leader = counter->group_leader;
503 int cpu = smp_processor_id();
504 unsigned long flags;
505 u64 perf_flags;
506 int err;
507
508 /*
509 * If this is a task context, we need to check whether it is
510 * the current task context of this cpu. If not it has been
511 * scheduled out before the smp call arrived.
512 */
513 if (ctx->task && cpuctx->task_ctx != ctx)
514 return;
515
516 spin_lock_irqsave(&ctx->lock, flags);
517 update_context_time(ctx);
518
519 /*
520 * Protect the list operation against NMI by disabling the
521 * counters on a global level. NOP for non NMI based counters.
522 */
523 perf_flags = hw_perf_save_disable();
524
525 add_counter_to_ctx(counter, ctx);
526
527 /*
528 * Don't put the counter on if it is disabled or if
529 * it is in a group and the group isn't on.
530 */
531 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
532 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
533 goto unlock;
534
535 /*
536 * An exclusive counter can't go on if there are already active
537 * hardware counters, and no hardware counter can go on if there
538 * is already an exclusive counter on.
539 */
540 if (!group_can_go_on(counter, cpuctx, 1))
541 err = -EEXIST;
542 else
543 err = counter_sched_in(counter, cpuctx, ctx, cpu);
544
545 if (err) {
546 /*
547 * This counter couldn't go on. If it is in a group
548 * then we have to pull the whole group off.
549 * If the counter group is pinned then put it in error state.
550 */
551 if (leader != counter)
552 group_sched_out(leader, cpuctx, ctx);
553 if (leader->hw_event.pinned) {
554 update_group_times(leader);
555 leader->state = PERF_COUNTER_STATE_ERROR;
556 }
557 }
558
559 if (!err && !ctx->task && cpuctx->max_pertask)
560 cpuctx->max_pertask--;
561
562 unlock:
563 hw_perf_restore(perf_flags);
564
565 spin_unlock_irqrestore(&ctx->lock, flags);
566}
567
568/*
569 * Attach a performance counter to a context
570 *
571 * First we add the counter to the list with the hardware enable bit
572 * in counter->hw_config cleared.
573 *
574 * If the counter is attached to a task which is on a CPU we use a smp
575 * call to enable it in the task context. The task might have been
576 * scheduled away, but we check this in the smp call again.
577 *
578 * Must be called with ctx->mutex held.
579 */
580static void
581perf_install_in_context(struct perf_counter_context *ctx,
582 struct perf_counter *counter,
583 int cpu)
584{
585 struct task_struct *task = ctx->task;
586
587 if (!task) {
588 /*
589 * Per cpu counters are installed via an smp call and
 590 * the install is always successful.
591 */
592 smp_call_function_single(cpu, __perf_install_in_context,
593 counter, 1);
594 return;
595 }
596
597 counter->task = task;
598retry:
599 task_oncpu_function_call(task, __perf_install_in_context,
600 counter);
601
602 spin_lock_irq(&ctx->lock);
603 /*
 604 * If the context is active, we need to retry the smp call.
605 */
606 if (ctx->is_active && list_empty(&counter->list_entry)) {
607 spin_unlock_irq(&ctx->lock);
608 goto retry;
609 }
610
611 /*
 612 * The lock prevents this context from being scheduled in, so we
 613 * can add the counter safely if the call above did not
 614 * succeed.
615 */
616 if (list_empty(&counter->list_entry))
617 add_counter_to_ctx(counter, ctx);
618 spin_unlock_irq(&ctx->lock);
619}
620
621/*
622 * Cross CPU call to enable a performance counter
623 */
624static void __perf_counter_enable(void *info)
625{
626 struct perf_counter *counter = info;
627 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
628 struct perf_counter_context *ctx = counter->ctx;
629 struct perf_counter *leader = counter->group_leader;
630 unsigned long flags;
631 int err;
632
633 /*
634 * If this is a per-task counter, need to check whether this
635 * counter's task is the current task on this cpu.
636 */
637 if (ctx->task && cpuctx->task_ctx != ctx)
638 return;
639
640 spin_lock_irqsave(&ctx->lock, flags);
641 update_context_time(ctx);
642
643 counter->prev_state = counter->state;
644 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
645 goto unlock;
646 counter->state = PERF_COUNTER_STATE_INACTIVE;
647 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
648
649 /*
650 * If the counter is in a group and isn't the group leader,
651 * then don't put it on unless the group is on.
652 */
653 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
654 goto unlock;
655
656 if (!group_can_go_on(counter, cpuctx, 1))
657 err = -EEXIST;
658 else
659 err = counter_sched_in(counter, cpuctx, ctx,
660 smp_processor_id());
661
662 if (err) {
663 /*
664 * If this counter can't go on and it's part of a
665 * group, then the whole group has to come off.
666 */
667 if (leader != counter)
668 group_sched_out(leader, cpuctx, ctx);
669 if (leader->hw_event.pinned) {
670 update_group_times(leader);
671 leader->state = PERF_COUNTER_STATE_ERROR;
672 }
673 }
674
675 unlock:
676 spin_unlock_irqrestore(&ctx->lock, flags);
677}
678
679/*
680 * Enable a counter.
681 */
682static void perf_counter_enable(struct perf_counter *counter)
683{
684 struct perf_counter_context *ctx = counter->ctx;
685 struct task_struct *task = ctx->task;
686
687 if (!task) {
688 /*
689 * Enable the counter on the cpu that it's on
690 */
691 smp_call_function_single(counter->cpu, __perf_counter_enable,
692 counter, 1);
693 return;
694 }
695
696 spin_lock_irq(&ctx->lock);
697 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
698 goto out;
699
700 /*
701 * If the counter is in error state, clear that first.
702 * That way, if we see the counter in error state below, we
703 * know that it has gone back into error state, as distinct
704 * from the task having been scheduled away before the
705 * cross-call arrived.
706 */
707 if (counter->state == PERF_COUNTER_STATE_ERROR)
708 counter->state = PERF_COUNTER_STATE_OFF;
709
710 retry:
711 spin_unlock_irq(&ctx->lock);
712 task_oncpu_function_call(task, __perf_counter_enable, counter);
713
714 spin_lock_irq(&ctx->lock);
715
716 /*
717 * If the context is active and the counter is still off,
718 * we need to retry the cross-call.
719 */
720 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
721 goto retry;
722
723 /*
724 * Since we have the lock this context can't be scheduled
725 * in, so we can change the state safely.
726 */
727 if (counter->state == PERF_COUNTER_STATE_OFF) {
728 counter->state = PERF_COUNTER_STATE_INACTIVE;
729 counter->tstamp_enabled =
730 ctx->time - counter->total_time_enabled;
731 }
732 out:
733 spin_unlock_irq(&ctx->lock);
734}
735
736static void perf_counter_refresh(struct perf_counter *counter, int refresh)
737{
738 atomic_add(refresh, &counter->event_limit);
739 perf_counter_enable(counter);
740}
741
742/*
743 * Enable a counter and all its children.
744 */
745static void perf_counter_enable_family(struct perf_counter *counter)
746{
747 struct perf_counter *child;
748
749 perf_counter_enable(counter);
750
751 /*
752 * Lock the mutex to protect the list of children
753 */
754 mutex_lock(&counter->mutex);
755 list_for_each_entry(child, &counter->child_list, child_list)
756 perf_counter_enable(child);
757 mutex_unlock(&counter->mutex);
758}
759
760void __perf_counter_sched_out(struct perf_counter_context *ctx,
761 struct perf_cpu_context *cpuctx)
762{
763 struct perf_counter *counter;
764 u64 flags;
765
766 spin_lock(&ctx->lock);
767 ctx->is_active = 0;
768 if (likely(!ctx->nr_counters))
769 goto out;
770 update_context_time(ctx);
771
772 flags = hw_perf_save_disable();
773 if (ctx->nr_active) {
774 list_for_each_entry(counter, &ctx->counter_list, list_entry)
775 group_sched_out(counter, cpuctx, ctx);
776 }
777 hw_perf_restore(flags);
778 out:
779 spin_unlock(&ctx->lock);
780}
781
782/*
783 * Called from scheduler to remove the counters of the current task,
784 * with interrupts disabled.
785 *
786 * We stop each counter and update the counter value in counter->count.
787 *
788 * This does not protect us against NMI, but disable()
789 * sets the disabled bit in the control field of counter _before_
790 * accessing the counter control register. If a NMI hits, then it will
791 * not restart the counter.
792 */
793void perf_counter_task_sched_out(struct task_struct *task, int cpu)
794{
795 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
796 struct perf_counter_context *ctx = &task->perf_counter_ctx;
797 struct pt_regs *regs;
798
799 if (likely(!cpuctx->task_ctx))
800 return;
801
802 update_context_time(ctx);
803
804 regs = task_pt_regs(task);
805 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
806 __perf_counter_sched_out(ctx, cpuctx);
807
808 cpuctx->task_ctx = NULL;
809}
810
811static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
812{
813 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
814}
815
816static int
817group_sched_in(struct perf_counter *group_counter,
818 struct perf_cpu_context *cpuctx,
819 struct perf_counter_context *ctx,
820 int cpu)
821{
822 struct perf_counter *counter, *partial_group;
823 int ret;
824
825 if (group_counter->state == PERF_COUNTER_STATE_OFF)
826 return 0;
827
828 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
829 if (ret)
830 return ret < 0 ? ret : 0;
831
832 group_counter->prev_state = group_counter->state;
833 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
834 return -EAGAIN;
835
836 /*
837 * Schedule in siblings as one group (if any):
838 */
839 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
840 counter->prev_state = counter->state;
841 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
842 partial_group = counter;
843 goto group_error;
844 }
845 }
846
847 return 0;
848
849group_error:
850 /*
851 * Groups can be scheduled in as one unit only, so undo any
852 * partial group before returning:
853 */
854 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
855 if (counter == partial_group)
856 break;
857 counter_sched_out(counter, cpuctx, ctx);
858 }
859 counter_sched_out(group_counter, cpuctx, ctx);
860
861 return -EAGAIN;
862}
863
864static void
865__perf_counter_sched_in(struct perf_counter_context *ctx,
866 struct perf_cpu_context *cpuctx, int cpu)
867{
868 struct perf_counter *counter;
869 u64 flags;
870 int can_add_hw = 1;
871
872 spin_lock(&ctx->lock);
873 ctx->is_active = 1;
874 if (likely(!ctx->nr_counters))
875 goto out;
876
877 ctx->timestamp = perf_clock();
878
879 flags = hw_perf_save_disable();
880
881 /*
882 * First go through the list and put on any pinned groups
883 * in order to give them the best chance of going on.
884 */
885 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
886 if (counter->state <= PERF_COUNTER_STATE_OFF ||
887 !counter->hw_event.pinned)
888 continue;
889 if (counter->cpu != -1 && counter->cpu != cpu)
890 continue;
891
892 if (group_can_go_on(counter, cpuctx, 1))
893 group_sched_in(counter, cpuctx, ctx, cpu);
894
895 /*
896 * If this pinned group hasn't been scheduled,
897 * put it in error state.
898 */
899 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
900 update_group_times(counter);
901 counter->state = PERF_COUNTER_STATE_ERROR;
902 }
903 }
904
905 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
906 /*
907 * Ignore counters in OFF or ERROR state, and
908 * ignore pinned counters since we did them already.
909 */
910 if (counter->state <= PERF_COUNTER_STATE_OFF ||
911 counter->hw_event.pinned)
912 continue;
913
914 /*
915 * Listen to the 'cpu' scheduling filter constraint
916 * of counters:
917 */
918 if (counter->cpu != -1 && counter->cpu != cpu)
919 continue;
920
921 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
922 if (group_sched_in(counter, cpuctx, ctx, cpu))
923 can_add_hw = 0;
924 }
925 }
926 hw_perf_restore(flags);
927 out:
928 spin_unlock(&ctx->lock);
929}
930
931/*
932 * Called from scheduler to add the counters of the current task
933 * with interrupts disabled.
934 *
935 * We restore the counter value and then enable it.
936 *
937 * This does not protect us against NMI, but enable()
938 * sets the enabled bit in the control field of counter _before_
 939 * accessing the counter control register. If an NMI hits, then it will
940 * keep the counter running.
941 */
942void perf_counter_task_sched_in(struct task_struct *task, int cpu)
943{
944 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
945 struct perf_counter_context *ctx = &task->perf_counter_ctx;
946
947 __perf_counter_sched_in(ctx, cpuctx, cpu);
948 cpuctx->task_ctx = ctx;
949}
950
951static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
952{
953 struct perf_counter_context *ctx = &cpuctx->ctx;
954
955 __perf_counter_sched_in(ctx, cpuctx, cpu);
956}
957
958int perf_counter_task_disable(void)
959{
960 struct task_struct *curr = current;
961 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
962 struct perf_counter *counter;
963 unsigned long flags;
964 u64 perf_flags;
965 int cpu;
966
967 if (likely(!ctx->nr_counters))
968 return 0;
969
970 local_irq_save(flags);
971 cpu = smp_processor_id();
972
973 perf_counter_task_sched_out(curr, cpu);
974
975 spin_lock(&ctx->lock);
976
977 /*
978 * Disable all the counters:
979 */
980 perf_flags = hw_perf_save_disable();
981
982 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
983 if (counter->state != PERF_COUNTER_STATE_ERROR) {
984 update_group_times(counter);
985 counter->state = PERF_COUNTER_STATE_OFF;
986 }
987 }
988
989 hw_perf_restore(perf_flags);
990
991 spin_unlock_irqrestore(&ctx->lock, flags);
992
993 return 0;
994}
995
996int perf_counter_task_enable(void)
997{
998 struct task_struct *curr = current;
999 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1000 struct perf_counter *counter;
1001 unsigned long flags;
1002 u64 perf_flags;
1003 int cpu;
1004
1005 if (likely(!ctx->nr_counters))
1006 return 0;
1007
1008 local_irq_save(flags);
1009 cpu = smp_processor_id();
1010
1011 perf_counter_task_sched_out(curr, cpu);
1012
1013 spin_lock(&ctx->lock);
1014
1015 /*
 1016 * Disable all the counters at the hw level while we update them:
1017 */
1018 perf_flags = hw_perf_save_disable();
1019
1020 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1021 if (counter->state > PERF_COUNTER_STATE_OFF)
1022 continue;
1023 counter->state = PERF_COUNTER_STATE_INACTIVE;
1024 counter->tstamp_enabled =
1025 ctx->time - counter->total_time_enabled;
1026 counter->hw_event.disabled = 0;
1027 }
1028 hw_perf_restore(perf_flags);
1029
1030 spin_unlock(&ctx->lock);
1031
1032 perf_counter_task_sched_in(curr, cpu);
1033
1034 local_irq_restore(flags);
1035
1036 return 0;
1037}
1038
1039/*
1040 * Round-robin a context's counters:
1041 */
1042static void rotate_ctx(struct perf_counter_context *ctx)
1043{
1044 struct perf_counter *counter;
1045 u64 perf_flags;
1046
1047 if (!ctx->nr_counters)
1048 return;
1049
1050 spin_lock(&ctx->lock);
1051 /*
1052 * Rotate the first entry last (works just fine for group counters too):
1053 */
1054 perf_flags = hw_perf_save_disable();
1055 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1056 list_move_tail(&counter->list_entry, &ctx->counter_list);
1057 break;
1058 }
1059 hw_perf_restore(perf_flags);
1060
1061 spin_unlock(&ctx->lock);
1062}
1063
1064void perf_counter_task_tick(struct task_struct *curr, int cpu)
1065{
1066 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1067 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1068 const int rotate_percpu = 0;
1069
1070 if (rotate_percpu)
1071 perf_counter_cpu_sched_out(cpuctx);
1072 perf_counter_task_sched_out(curr, cpu);
1073
1074 if (rotate_percpu)
1075 rotate_ctx(&cpuctx->ctx);
1076 rotate_ctx(ctx);
1077
1078 if (rotate_percpu)
1079 perf_counter_cpu_sched_in(cpuctx, cpu);
1080 perf_counter_task_sched_in(curr, cpu);
1081}
1082
1083/*
1084 * Cross CPU call to read the hardware counter
1085 */
1086static void __read(void *info)
1087{
1088 struct perf_counter *counter = info;
1089 struct perf_counter_context *ctx = counter->ctx;
1090 unsigned long flags;
1091
1092 local_irq_save(flags);
1093 if (ctx->is_active)
1094 update_context_time(ctx);
1095 counter->hw_ops->read(counter);
1096 update_counter_times(counter);
1097 local_irq_restore(flags);
1098}
1099
1100static u64 perf_counter_read(struct perf_counter *counter)
1101{
1102 /*
1103 * If counter is enabled and currently active on a CPU, update the
1104 * value in the counter structure:
1105 */
1106 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1107 smp_call_function_single(counter->oncpu,
1108 __read, counter, 1);
1109 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1110 update_counter_times(counter);
1111 }
1112
1113 return atomic64_read(&counter->count);
1114}
1115
1116static void put_context(struct perf_counter_context *ctx)
1117{
1118 if (ctx->task)
1119 put_task_struct(ctx->task);
1120}
1121
1122static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1123{
1124 struct perf_cpu_context *cpuctx;
1125 struct perf_counter_context *ctx;
1126 struct task_struct *task;
1127
1128 /*
1129 * If cpu is not a wildcard then this is a percpu counter:
1130 */
1131 if (cpu != -1) {
1132 /* Must be root to operate on a CPU counter: */
1133 if (!capable(CAP_SYS_ADMIN))
1134 return ERR_PTR(-EACCES);
1135
1136 if (cpu < 0 || cpu > num_possible_cpus())
1137 return ERR_PTR(-EINVAL);
1138
1139 /*
 1140 * We could be clever and allow attaching a counter to an
1141 * offline CPU and activate it when the CPU comes up, but
1142 * that's for later.
1143 */
1144 if (!cpu_isset(cpu, cpu_online_map))
1145 return ERR_PTR(-ENODEV);
1146
1147 cpuctx = &per_cpu(perf_cpu_context, cpu);
1148 ctx = &cpuctx->ctx;
1149
1150 return ctx;
1151 }
1152
1153 rcu_read_lock();
1154 if (!pid)
1155 task = current;
1156 else
1157 task = find_task_by_vpid(pid);
1158 if (task)
1159 get_task_struct(task);
1160 rcu_read_unlock();
1161
1162 if (!task)
1163 return ERR_PTR(-ESRCH);
1164
1165 ctx = &task->perf_counter_ctx;
1166 ctx->task = task;
1167
1168 /* Reuse ptrace permission checks for now. */
1169 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1170 put_context(ctx);
1171 return ERR_PTR(-EACCES);
1172 }
1173
1174 return ctx;
1175}
1176
1177static void free_counter_rcu(struct rcu_head *head)
1178{
1179 struct perf_counter *counter;
1180
1181 counter = container_of(head, struct perf_counter, rcu_head);
1182 kfree(counter);
1183}
1184
1185static void perf_pending_sync(struct perf_counter *counter);
1186
1187static void free_counter(struct perf_counter *counter)
1188{
1189 perf_pending_sync(counter);
1190
1191 if (counter->destroy)
1192 counter->destroy(counter);
1193
1194 call_rcu(&counter->rcu_head, free_counter_rcu);
1195}
1196
1197/*
1198 * Called when the last reference to the file is gone.
1199 */
1200static int perf_release(struct inode *inode, struct file *file)
1201{
1202 struct perf_counter *counter = file->private_data;
1203 struct perf_counter_context *ctx = counter->ctx;
1204
1205 file->private_data = NULL;
1206
1207 mutex_lock(&ctx->mutex);
1208 mutex_lock(&counter->mutex);
1209
1210 perf_counter_remove_from_context(counter);
1211
1212 mutex_unlock(&counter->mutex);
1213 mutex_unlock(&ctx->mutex);
1214
1215 free_counter(counter);
1216 put_context(ctx);
1217
1218 return 0;
1219}
1220
1221/*
1222 * Read the performance counter - simple non blocking version for now
1223 */
1224static ssize_t
1225perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1226{
1227 u64 values[3];
1228 int n;
1229
1230 /*
1231 * Return end-of-file for a read on a counter that is in
1232 * error state (i.e. because it was pinned but it couldn't be
1233 * scheduled on to the CPU at some point).
1234 */
1235 if (counter->state == PERF_COUNTER_STATE_ERROR)
1236 return 0;
1237
1238 mutex_lock(&counter->mutex);
1239 values[0] = perf_counter_read(counter);
1240 n = 1;
1241 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1242 values[n++] = counter->total_time_enabled +
1243 atomic64_read(&counter->child_total_time_enabled);
1244 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1245 values[n++] = counter->total_time_running +
1246 atomic64_read(&counter->child_total_time_running);
1247 mutex_unlock(&counter->mutex);
1248
1249 if (count < n * sizeof(u64))
1250 return -EINVAL;
1251 count = n * sizeof(u64);
1252
1253 if (copy_to_user(buf, values, count))
1254 return -EFAULT;
1255
1256 return count;
1257}
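/*
 * Illustrative user-space sketch of the matching read (not from the patch
 * itself): the buffer receives one to three u64 values, in the order the
 * kernel writes them above.  Optional values are appended only if their
 * PERF_FORMAT_* bit was set in hw_event.read_format, so with only
 * ..._TIME_RUNNING requested the running time lands in values[1].
 */
	__u64 values[3];
	ssize_t n;

	n = read(fd, values, sizeof(values));
	/* n == 0 here means the counter is in error state (see above)   */
	/* values[0]: counter value                                       */
	/* values[1]: total time enabled, if PERF_FORMAT_TOTAL_TIME_ENABLED */
	/* values[2]: total time running, if PERF_FORMAT_TOTAL_TIME_RUNNING */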
1258
1259static ssize_t
1260perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1261{
1262 struct perf_counter *counter = file->private_data;
1263
1264 return perf_read_hw(counter, buf, count);
1265}
1266
1267static unsigned int perf_poll(struct file *file, poll_table *wait)
1268{
1269 struct perf_counter *counter = file->private_data;
1270 struct perf_mmap_data *data;
1271 unsigned int events;
1272
1273 rcu_read_lock();
1274 data = rcu_dereference(counter->data);
1275 if (data)
1276 events = atomic_xchg(&data->wakeup, 0);
1277 else
1278 events = POLL_HUP;
1279 rcu_read_unlock();
1280
1281 poll_wait(file, &counter->waitq, wait);
1282
1283 return events;
1284}
1285
1286static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1287{
1288 struct perf_counter *counter = file->private_data;
1289 int err = 0;
1290
1291 switch (cmd) {
1292 case PERF_COUNTER_IOC_ENABLE:
1293 perf_counter_enable_family(counter);
1294 break;
1295 case PERF_COUNTER_IOC_DISABLE:
1296 perf_counter_disable_family(counter);
1297 break;
1298 case PERF_COUNTER_IOC_REFRESH:
1299 perf_counter_refresh(counter, arg);
1300 break;
1301 default:
1302 err = -ENOTTY;
1303 }
1304 return err;
1305}
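/*
 * Illustrative user-space usage of the ioctls above (sketch, not from the
 * patch itself).  ENABLE and DISABLE ignore the argument and, judging by
 * the *_family naming and the child-list locking further down, also act on
 * inherited child counters; REFRESH forwards its numeric argument to
 * perf_counter_refresh(), which is not shown in this hunk.
 */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
	ioctl(fd, PERF_COUNTER_IOC_REFRESH, 1);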
1306
1307/*
1308 * Callers need to ensure there can be no nesting of this function, otherwise
1309 * the seqlock logic goes bad. We cannot serialize this because the arch
1310 * code calls this from NMI context.
1311 */
1312void perf_counter_update_userpage(struct perf_counter *counter)
1313{
1314 struct perf_mmap_data *data;
1315 struct perf_counter_mmap_page *userpg;
1316
1317 rcu_read_lock();
1318 data = rcu_dereference(counter->data);
1319 if (!data)
1320 goto unlock;
1321
1322 userpg = data->user_page;
1323
1324 /*
1325	 * Disable preemption so as not to let the corresponding user-space
1326 * spin too long if we get preempted.
1327 */
1328 preempt_disable();
1329 ++userpg->lock;
1330 barrier();
1331 userpg->index = counter->hw.idx;
1332 userpg->offset = atomic64_read(&counter->count);
1333 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1334 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1335
1336 barrier();
1337 ++userpg->lock;
1338 preempt_enable();
1339unlock:
1340 rcu_read_unlock();
1341}
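/*
 * The matching user-space read side is a seqlock-style retry loop.  This
 * is an illustrative sketch (not from the patch itself), assuming `pc'
 * points at the first mmap()ed page and barrier() is a user-space
 * compiler barrier:
 */
	__u32 seq, index;
	__s64 count;

	do {
		seq = pc->lock;
		barrier();
		index = pc->index;	/* hw counter index, e.g. for RDPMC */
		count = pc->offset;	/* while active, add a self-read of
					 * that hw counter to this base */
		barrier();
	} while (pc->lock != seq || (seq & 1));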
1342
1343static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1344{
1345 struct perf_counter *counter = vma->vm_file->private_data;
1346 struct perf_mmap_data *data;
1347 int ret = VM_FAULT_SIGBUS;
1348
1349 rcu_read_lock();
1350 data = rcu_dereference(counter->data);
1351 if (!data)
1352 goto unlock;
1353
1354 if (vmf->pgoff == 0) {
1355 vmf->page = virt_to_page(data->user_page);
1356 } else {
1357 int nr = vmf->pgoff - 1;
1358
1359			if ((unsigned)nr >= data->nr_pages)
1360 goto unlock;
1361
1362 vmf->page = virt_to_page(data->data_pages[nr]);
1363 }
1364 get_page(vmf->page);
1365 ret = 0;
1366unlock:
1367 rcu_read_unlock();
1368
1369 return ret;
1370}
1371
1372static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1373{
1374 struct perf_mmap_data *data;
1375 unsigned long size;
1376 int i;
1377
1378 WARN_ON(atomic_read(&counter->mmap_count));
1379
1380 size = sizeof(struct perf_mmap_data);
1381 size += nr_pages * sizeof(void *);
1382
1383 data = kzalloc(size, GFP_KERNEL);
1384 if (!data)
1385 goto fail;
1386
1387 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1388 if (!data->user_page)
1389 goto fail_user_page;
1390
1391 for (i = 0; i < nr_pages; i++) {
1392 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1393 if (!data->data_pages[i])
1394 goto fail_data_pages;
1395 }
1396
1397 data->nr_pages = nr_pages;
1398
1399 rcu_assign_pointer(counter->data, data);
1400
1401 return 0;
1402
1403fail_data_pages:
1404 for (i--; i >= 0; i--)
1405 free_page((unsigned long)data->data_pages[i]);
1406
1407 free_page((unsigned long)data->user_page);
1408
1409fail_user_page:
1410 kfree(data);
1411
1412fail:
1413 return -ENOMEM;
1414}
1415
1416static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1417{
1418 struct perf_mmap_data *data = container_of(rcu_head,
1419 struct perf_mmap_data, rcu_head);
1420 int i;
1421
1422 free_page((unsigned long)data->user_page);
1423 for (i = 0; i < data->nr_pages; i++)
1424 free_page((unsigned long)data->data_pages[i]);
1425 kfree(data);
1426}
1427
1428static void perf_mmap_data_free(struct perf_counter *counter)
1429{
1430 struct perf_mmap_data *data = counter->data;
1431
1432 WARN_ON(atomic_read(&counter->mmap_count));
1433
1434 rcu_assign_pointer(counter->data, NULL);
1435 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1436}
1437
1438static void perf_mmap_open(struct vm_area_struct *vma)
1439{
1440 struct perf_counter *counter = vma->vm_file->private_data;
1441
1442 atomic_inc(&counter->mmap_count);
1443}
1444
1445static void perf_mmap_close(struct vm_area_struct *vma)
1446{
1447 struct perf_counter *counter = vma->vm_file->private_data;
1448
1449 if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1450 &counter->mmap_mutex)) {
1451 vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
1452 perf_mmap_data_free(counter);
1453 mutex_unlock(&counter->mmap_mutex);
1454 }
1455}
1456
1457static struct vm_operations_struct perf_mmap_vmops = {
1458 .open = perf_mmap_open,
1459 .close = perf_mmap_close,
1460 .fault = perf_mmap_fault,
1461};
1462
1463static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1464{
1465 struct perf_counter *counter = file->private_data;
1466 unsigned long vma_size;
1467 unsigned long nr_pages;
1468 unsigned long locked, lock_limit;
1469 int ret = 0;
1470
1471 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1472 return -EINVAL;
1473
1474 vma_size = vma->vm_end - vma->vm_start;
1475 nr_pages = (vma_size / PAGE_SIZE) - 1;
1476
1477 /*
1478	 * If we have data pages, ensure there is a power-of-two number of
1479	 * them so we can use bitmasks instead of modulo.
1480 */
1481 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1482 return -EINVAL;
1483
1484 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1485 return -EINVAL;
1486
1487 if (vma->vm_pgoff != 0)
1488 return -EINVAL;
1489
1490 mutex_lock(&counter->mmap_mutex);
1491 if (atomic_inc_not_zero(&counter->mmap_count)) {
1492 if (nr_pages != counter->data->nr_pages)
1493 ret = -EINVAL;
1494 goto unlock;
1495 }
1496
1497 locked = vma->vm_mm->locked_vm;
1498 locked += nr_pages + 1;
1499
1500 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1501 lock_limit >>= PAGE_SHIFT;
1502
1503 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1504 ret = -EPERM;
1505 goto unlock;
1506 }
1507
1508 WARN_ON(counter->data);
1509 ret = perf_mmap_data_alloc(counter, nr_pages);
1510 if (ret)
1511 goto unlock;
1512
1513 atomic_set(&counter->mmap_count, 1);
1514 vma->vm_mm->locked_vm += nr_pages + 1;
1515unlock:
1516 mutex_unlock(&counter->mmap_mutex);
1517
1518 vma->vm_flags &= ~VM_MAYWRITE;
1519 vma->vm_flags |= VM_RESERVED;
1520 vma->vm_ops = &perf_mmap_vmops;
1521
1522 return ret;
1523}
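/*
 * Illustrative user-space sketch of a mapping accepted above (not from
 * the patch itself): read-only, MAP_SHARED, offset 0, and 1 + 2^n pages
 * in size -- one control page plus a power-of-two number of data pages.
 * The pages are charged against RLIMIT_MEMLOCK unless the caller has
 * CAP_IPC_LOCK.
 */
	int n = 4;					/* 16 data pages */
	size_t page_size = sysconf(_SC_PAGESIZE);
	size_t len = (1 + (1 << n)) * page_size;
	void *base;

	base = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	/* base            : struct perf_counter_mmap_page (control/status) */
	/* base + page_size: start of the event data pages                  */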
1524
1525static int perf_fasync(int fd, struct file *filp, int on)
1526{
1527 struct perf_counter *counter = filp->private_data;
1528 struct inode *inode = filp->f_path.dentry->d_inode;
1529 int retval;
1530
1531 mutex_lock(&inode->i_mutex);
1532 retval = fasync_helper(fd, filp, on, &counter->fasync);
1533 mutex_unlock(&inode->i_mutex);
1534
1535 if (retval < 0)
1536 return retval;
1537
1538 return 0;
1539}
1540
1541static const struct file_operations perf_fops = {
1542 .release = perf_release,
1543 .read = perf_read,
1544 .poll = perf_poll,
1545 .unlocked_ioctl = perf_ioctl,
1546 .compat_ioctl = perf_ioctl,
1547 .mmap = perf_mmap,
1548 .fasync = perf_fasync,
1549};
1550
1551/*
1552 * Perf counter wakeup
1553 *
1554 * If there's data, ensure we set the poll() state and publish everything
1555 * to user-space before waking everybody up.
1556 */
1557
1558void perf_counter_wakeup(struct perf_counter *counter)
1559{
1560 struct perf_mmap_data *data;
1561
1562 rcu_read_lock();
1563 data = rcu_dereference(counter->data);
1564 if (data) {
1565 atomic_set(&data->wakeup, POLL_IN);
1566 /*
1567 * Ensure all data writes are issued before updating the
1568		 * user-space data head information. The matching rmb() is
1569		 * issued by user-space after reading this value.
1570 */
1571 smp_wmb();
1572 data->user_page->data_head = atomic_read(&data->head);
1573 }
1574 rcu_read_unlock();
1575
1576 wake_up_all(&counter->waitq);
1577
1578 if (counter->pending_kill) {
1579 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1580 counter->pending_kill = 0;
1581 }
1582}
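/*
 * Illustrative user-space consumer sketch matching the smp_wmb() above
 * (not from the patch itself): wait for POLLIN, read data_head, then
 * issue a read barrier before touching the newly published event data.
 * rmb() here stands for an architecture-appropriate user-space read
 * barrier and is an assumed helper.
 */
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	__u64 head;

	poll(&pfd, 1, -1);
	head = pc->data_head;
	rmb();
	/* parse struct perf_event_header records in the data pages up to head */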
1583
1584/*
1585 * Pending wakeups
1586 *
1587 * Handle the case where we need to wake up from NMI (or rq->lock) context.
1588 *
1589 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1590 * singly linked list and use cmpxchg() to add entries locklessly.
1591 */
1592
1593static void perf_pending_counter(struct perf_pending_entry *entry)
1594{
1595 struct perf_counter *counter = container_of(entry,
1596 struct perf_counter, pending);
1597
1598 if (counter->pending_disable) {
1599 counter->pending_disable = 0;
1600 perf_counter_disable(counter);
1601 }
1602
1603 if (counter->pending_wakeup) {
1604 counter->pending_wakeup = 0;
1605 perf_counter_wakeup(counter);
1606 }
1607}
1608
1609#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
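/*
 * ->next == NULL means "not queued".  PENDING_TAIL is a sentinel that both
 * terminates the per-cpu list and marks an entry as queued, so
 * perf_pending_queue() below can detect and reject double-queueing with a
 * single cmpxchg() on ->next.
 */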
1610
1611static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
1612 PENDING_TAIL,
1613};
1614
1615static void perf_pending_queue(struct perf_pending_entry *entry,
1616 void (*func)(struct perf_pending_entry *))
1617{
1618 struct perf_pending_entry **head;
1619
1620 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
1621 return;
1622
1623 entry->func = func;
1624
1625 head = &get_cpu_var(perf_pending_head);
1626
1627 do {
1628 entry->next = *head;
1629 } while (cmpxchg(head, entry->next, entry) != entry->next);
1630
1631 set_perf_counter_pending();
1632
1633 put_cpu_var(perf_pending_head);
1634}
1635
1636static int __perf_pending_run(void)
1637{
1638 struct perf_pending_entry *list;
1639 int nr = 0;
1640
1641 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
1642 while (list != PENDING_TAIL) {
1643 void (*func)(struct perf_pending_entry *);
1644 struct perf_pending_entry *entry = list;
1645
1646 list = list->next;
1647
1648 func = entry->func;
1649 entry->next = NULL;
1650 /*
1651 * Ensure we observe the unqueue before we issue the wakeup,
1652 * so that we won't be waiting forever.
1653 * -- see perf_not_pending().
1654 */
1655 smp_wmb();
1656
1657 func(entry);
1658 nr++;
1659 }
1660
1661 return nr;
1662}
1663
1664static inline int perf_not_pending(struct perf_counter *counter)
1665{
1666 /*
1667	 * If we flush on whichever CPU we are currently running on, there is
1668	 * a chance we don't need to wait.
1669 */
1670 get_cpu();
1671 __perf_pending_run();
1672 put_cpu();
1673
1674 /*
1675 * Ensure we see the proper queue state before going to sleep
1676	 * so that we do not miss the wakeup. -- see __perf_pending_run()
1677 */
1678 smp_rmb();
1679 return counter->pending.next == NULL;
1680}
1681
1682static void perf_pending_sync(struct perf_counter *counter)
1683{
1684 wait_event(counter->waitq, perf_not_pending(counter));
1685}
1686
1687void perf_counter_do_pending(void)
1688{
1689 __perf_pending_run();
1690}
1691
1692/*
1693 * Callchain support -- arch specific
1694 */
1695
1696__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1697{
1698 return NULL;
1699}
1700
1701/*
1702 * Output
1703 */
1704
1705struct perf_output_handle {
1706 struct perf_counter *counter;
1707 struct perf_mmap_data *data;
1708 unsigned int offset;
1709 unsigned int head;
1710 int wakeup;
1711 int nmi;
1712 int overflow;
1713};
1714
1715static inline void __perf_output_wakeup(struct perf_output_handle *handle)
1716{
1717 if (handle->nmi) {
1718 handle->counter->pending_wakeup = 1;
1719 perf_pending_queue(&handle->counter->pending,
1720 perf_pending_counter);
1721 } else
1722 perf_counter_wakeup(handle->counter);
1723}
1724
1725static int perf_output_begin(struct perf_output_handle *handle,
1726 struct perf_counter *counter, unsigned int size,
1727 int nmi, int overflow)
1728{
1729 struct perf_mmap_data *data;
1730 unsigned int offset, head;
1731
1732 rcu_read_lock();
1733 data = rcu_dereference(counter->data);
1734 if (!data)
1735 goto out;
1736
1737 handle->counter = counter;
1738 handle->nmi = nmi;
1739 handle->overflow = overflow;
1740
1741 if (!data->nr_pages)
1742 goto fail;
1743
1744 do {
1745 offset = head = atomic_read(&data->head);
1746 head += size;
1747 } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1748
1749 handle->data = data;
1750 handle->offset = offset;
1751 handle->head = head;
1752 handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
1753
1754 return 0;
1755
1756fail:
1757 __perf_output_wakeup(handle);
1758out:
1759 rcu_read_unlock();
1760
1761 return -ENOSPC;
1762}
1763
1764static void perf_output_copy(struct perf_output_handle *handle,
1765 void *buf, unsigned int len)
1766{
1767 unsigned int pages_mask;
1768 unsigned int offset;
1769 unsigned int size;
1770 void **pages;
1771
1772 offset = handle->offset;
1773 pages_mask = handle->data->nr_pages - 1;
1774 pages = handle->data->data_pages;
1775
1776 do {
1777 unsigned int page_offset;
1778 int nr;
1779
1780 nr = (offset >> PAGE_SHIFT) & pages_mask;
1781 page_offset = offset & (PAGE_SIZE - 1);
1782 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1783
1784 memcpy(pages[nr] + page_offset, buf, size);
1785
1786 len -= size;
1787 buf += size;
1788 offset += size;
1789 } while (len);
1790
1791 handle->offset = offset;
1792
1793 WARN_ON_ONCE(handle->offset > handle->head);
1794}
1795
1796#define perf_output_put(handle, x) \
1797 perf_output_copy((handle), &(x), sizeof(x))
1798
1799static void perf_output_end(struct perf_output_handle *handle)
1800{
1801 int wakeup_events = handle->counter->hw_event.wakeup_events;
1802
1803 if (handle->overflow && wakeup_events) {
1804 int events = atomic_inc_return(&handle->data->events);
1805 if (events >= wakeup_events) {
1806 atomic_sub(wakeup_events, &handle->data->events);
1807 __perf_output_wakeup(handle);
1808 }
1809 } else if (handle->wakeup)
1810 __perf_output_wakeup(handle);
1811 rcu_read_unlock();
1812}
1813
1814static void perf_counter_output(struct perf_counter *counter,
1815 int nmi, struct pt_regs *regs)
1816{
1817 int ret;
1818 u64 record_type = counter->hw_event.record_type;
1819 struct perf_output_handle handle;
1820 struct perf_event_header header;
1821 u64 ip;
1822 struct {
1823 u32 pid, tid;
1824 } tid_entry;
1825 struct {
1826 u64 event;
1827 u64 counter;
1828 } group_entry;
1829 struct perf_callchain_entry *callchain = NULL;
1830 int callchain_size = 0;
1831 u64 time;
1832
1833 header.type = PERF_EVENT_COUNTER_OVERFLOW;
1834 header.size = sizeof(header);
1835
1836 if (record_type & PERF_RECORD_IP) {
1837 ip = instruction_pointer(regs);
1838 header.type |= __PERF_EVENT_IP;
1839 header.size += sizeof(ip);
1840 }
1841
1842 if (record_type & PERF_RECORD_TID) {
1843 /* namespace issues */
1844 tid_entry.pid = current->group_leader->pid;
1845 tid_entry.tid = current->pid;
1846
1847 header.type |= __PERF_EVENT_TID;
1848 header.size += sizeof(tid_entry);
1849 }
1850
1851 if (record_type & PERF_RECORD_GROUP) {
1852 header.type |= __PERF_EVENT_GROUP;
1853 header.size += sizeof(u64) +
1854 counter->nr_siblings * sizeof(group_entry);
1855 }
1856
1857 if (record_type & PERF_RECORD_CALLCHAIN) {
1858 callchain = perf_callchain(regs);
1859
1860 if (callchain) {
1861 callchain_size = (1 + callchain->nr) * sizeof(u64);
1862
1863 header.type |= __PERF_EVENT_CALLCHAIN;
1864 header.size += callchain_size;
1865 }
1866 }
1867
1868 if (record_type & PERF_RECORD_TIME) {
1869 /*
1870 * Maybe do better on x86 and provide cpu_clock_nmi()
1871 */
1872 time = sched_clock();
1873
1874 header.type |= __PERF_EVENT_TIME;
1875 header.size += sizeof(u64);
1876 }
1877
1878 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
1879 if (ret)
1880 return;
1881
1882 perf_output_put(&handle, header);
1883
1884 if (record_type & PERF_RECORD_IP)
1885 perf_output_put(&handle, ip);
1886
1887 if (record_type & PERF_RECORD_TID)
1888 perf_output_put(&handle, tid_entry);
1889
1890 if (record_type & PERF_RECORD_GROUP) {
1891 struct perf_counter *leader, *sub;
1892 u64 nr = counter->nr_siblings;
1893
1894 perf_output_put(&handle, nr);
1895
1896 leader = counter->group_leader;
1897 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1898 if (sub != counter)
1899 sub->hw_ops->read(sub);
1900
1901 group_entry.event = sub->hw_event.config;
1902 group_entry.counter = atomic64_read(&sub->count);
1903
1904 perf_output_put(&handle, group_entry);
1905 }
1906 }
1907
1908 if (callchain)
1909 perf_output_copy(&handle, callchain, callchain_size);
1910
1911 if (record_type & PERF_RECORD_TIME)
1912 perf_output_put(&handle, time);
1913
1914 perf_output_end(&handle);
1915}
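/*
 * Resulting record layout as seen by a user-space parser (inferred from
 * the function above): header.type has PERF_EVENT_COUNTER_OVERFLOW set,
 * and each optional field follows the header in exactly this order, but
 * only if the corresponding bit is also set in header.type:
 *
 *	struct perf_event_header header;
 *	u64 ip;                              __PERF_EVENT_IP
 *	u32 pid, tid;                        __PERF_EVENT_TID
 *	u64 nr;                              __PERF_EVENT_GROUP
 *	{ u64 event, counter; } grp[nr];
 *	callchain data                       __PERF_EVENT_CALLCHAIN
 *	    (a u64 count followed by that many u64 entries, per the
 *	     (1 + nr) * sizeof(u64) sizing above)
 *	u64 time;                            __PERF_EVENT_TIME
 */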
1916
1917/*
1918 * mmap tracking
1919 */
1920
1921struct perf_mmap_event {
1922 struct file *file;
1923 char *file_name;
1924 int file_size;
1925
1926 struct {
1927 struct perf_event_header header;
1928
1929 u32 pid;
1930 u32 tid;
1931 u64 start;
1932 u64 len;
1933 u64 pgoff;
1934 } event;
1935};
1936
1937static void perf_counter_mmap_output(struct perf_counter *counter,
1938 struct perf_mmap_event *mmap_event)
1939{
1940 struct perf_output_handle handle;
1941 int size = mmap_event->event.header.size;
1942 int ret = perf_output_begin(&handle, counter, size, 0, 0);
1943
1944 if (ret)
1945 return;
1946
1947 perf_output_put(&handle, mmap_event->event);
1948 perf_output_copy(&handle, mmap_event->file_name,
1949 mmap_event->file_size);
1950 perf_output_end(&handle);
1951}
1952
1953static int perf_counter_mmap_match(struct perf_counter *counter,
1954 struct perf_mmap_event *mmap_event)
1955{
1956 if (counter->hw_event.mmap &&
1957 mmap_event->event.header.type == PERF_EVENT_MMAP)
1958 return 1;
1959
1960 if (counter->hw_event.munmap &&
1961 mmap_event->event.header.type == PERF_EVENT_MUNMAP)
1962 return 1;
1963
1964 return 0;
1965}
1966
1967static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
1968 struct perf_mmap_event *mmap_event)
1969{
1970 struct perf_counter *counter;
1971
1972 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1973 return;
1974
1975 rcu_read_lock();
1976 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1977 if (perf_counter_mmap_match(counter, mmap_event))
1978 perf_counter_mmap_output(counter, mmap_event);
1979 }
1980 rcu_read_unlock();
1981}
1982
1983static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
1984{
1985 struct perf_cpu_context *cpuctx;
1986 struct file *file = mmap_event->file;
1987 unsigned int size;
1988 char tmp[16];
1989 char *buf = NULL;
1990 char *name;
1991
1992 if (file) {
1993 buf = kzalloc(PATH_MAX, GFP_KERNEL);
1994 if (!buf) {
1995 name = strncpy(tmp, "//enomem", sizeof(tmp));
1996 goto got_name;
1997 }
1998 name = dentry_path(file->f_dentry, buf, PATH_MAX);
1999 if (IS_ERR(name)) {
2000 name = strncpy(tmp, "//toolong", sizeof(tmp));
2001 goto got_name;
2002 }
2003 } else {
2004 name = strncpy(tmp, "//anon", sizeof(tmp));
2005 goto got_name;
2006 }
2007
2008got_name:
2009 size = ALIGN(strlen(name), sizeof(u64));
2010
2011 mmap_event->file_name = name;
2012 mmap_event->file_size = size;
2013
2014 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2015
2016 cpuctx = &get_cpu_var(perf_cpu_context);
2017 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2018 put_cpu_var(perf_cpu_context);
2019
2020 perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2021
2022 kfree(buf);
2023}
2024
2025void perf_counter_mmap(unsigned long addr, unsigned long len,
2026 unsigned long pgoff, struct file *file)
2027{
2028 struct perf_mmap_event mmap_event = {
2029 .file = file,
2030 .event = {
2031 .header = { .type = PERF_EVENT_MMAP, },
2032 .pid = current->group_leader->pid,
2033 .tid = current->pid,
2034 .start = addr,
2035 .len = len,
2036 .pgoff = pgoff,
2037 },
2038 };
2039
2040 perf_counter_mmap_event(&mmap_event);
2041}
2042
2043void perf_counter_munmap(unsigned long addr, unsigned long len,
2044 unsigned long pgoff, struct file *file)
2045{
2046 struct perf_mmap_event mmap_event = {
2047 .file = file,
2048 .event = {
2049 .header = { .type = PERF_EVENT_MUNMAP, },
2050 .pid = current->group_leader->pid,
2051 .tid = current->pid,
2052 .start = addr,
2053 .len = len,
2054 .pgoff = pgoff,
2055 },
2056 };
2057
2058 perf_counter_mmap_event(&mmap_event);
2059}
2060
2061/*
2062 * Generic counter overflow handling.
2063 */
2064
2065int perf_counter_overflow(struct perf_counter *counter,
2066 int nmi, struct pt_regs *regs)
2067{
2068 int events = atomic_read(&counter->event_limit);
2069 int ret = 0;
2070
2071 counter->pending_kill = POLL_IN;
2072 if (events && atomic_dec_and_test(&counter->event_limit)) {
2073 ret = 1;
2074 counter->pending_kill = POLL_HUP;
2075 if (nmi) {
2076 counter->pending_disable = 1;
2077 perf_pending_queue(&counter->pending,
2078 perf_pending_counter);
2079 } else
2080 perf_counter_disable(counter);
2081 }
2082
2083 perf_counter_output(counter, nmi, regs);
2084 return ret;
2085}
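/*
 * When an event_limit has been set, each overflow decrements it; once it
 * reaches zero the counter is disabled (directly, or via the pending
 * queue when called from NMI context), pending_kill is switched to
 * POLL_HUP so the SIGIO delivered by kill_fasync() carries a hang-up
 * code, and the function returns 1 so the caller can stop its sampling
 * source (the hrtimer handler below stops restarting itself, for
 * instance).
 */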
2086
2087/*
2088 * Generic software counter infrastructure
2089 */
2090
2091static void perf_swcounter_update(struct perf_counter *counter)
2092{
2093 struct hw_perf_counter *hwc = &counter->hw;
2094 u64 prev, now;
2095 s64 delta;
2096
2097again:
2098 prev = atomic64_read(&hwc->prev_count);
2099 now = atomic64_read(&hwc->count);
2100 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2101 goto again;
2102
2103 delta = now - prev;
2104
2105 atomic64_add(delta, &counter->count);
2106 atomic64_sub(delta, &hwc->period_left);
2107}
2108
2109static void perf_swcounter_set_period(struct perf_counter *counter)
2110{
2111 struct hw_perf_counter *hwc = &counter->hw;
2112 s64 left = atomic64_read(&hwc->period_left);
2113 s64 period = hwc->irq_period;
2114
2115 if (unlikely(left <= -period)) {
2116 left = period;
2117 atomic64_set(&hwc->period_left, left);
2118 }
2119
2120 if (unlikely(left <= 0)) {
2121 left += period;
2122 atomic64_add(period, &hwc->period_left);
2123 }
2124
2125 atomic64_set(&hwc->prev_count, -left);
2126 atomic64_set(&hwc->count, -left);
2127}
2128
2129static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2130{
2131 enum hrtimer_restart ret = HRTIMER_RESTART;
2132 struct perf_counter *counter;
2133 struct pt_regs *regs;
2134
2135 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2136 counter->hw_ops->read(counter);
2137
2138 regs = get_irq_regs();
2139 /*
2140 * In case we exclude kernel IPs or are somehow not in interrupt
2141 * context, provide the next best thing, the user IP.
2142 */
2143 if ((counter->hw_event.exclude_kernel || !regs) &&
2144 !counter->hw_event.exclude_user)
2145 regs = task_pt_regs(current);
2146
2147 if (regs) {
2148 if (perf_counter_overflow(counter, 0, regs))
2149 ret = HRTIMER_NORESTART;
2150 }
2151
2152 hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2153
2154 return ret;
2155}
2156
2157static void perf_swcounter_overflow(struct perf_counter *counter,
2158 int nmi, struct pt_regs *regs)
2159{
2160 perf_swcounter_update(counter);
2161 perf_swcounter_set_period(counter);
2162 if (perf_counter_overflow(counter, nmi, regs))
2163 /* soft-disable the counter */
2164 ;
2165
2166}
2167
2168static int perf_swcounter_match(struct perf_counter *counter,
2169 enum perf_event_types type,
2170 u32 event, struct pt_regs *regs)
2171{
2172 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2173 return 0;
2174
2175 if (perf_event_raw(&counter->hw_event))
2176 return 0;
2177
2178 if (perf_event_type(&counter->hw_event) != type)
2179 return 0;
2180
2181 if (perf_event_id(&counter->hw_event) != event)
2182 return 0;
2183
2184 if (counter->hw_event.exclude_user && user_mode(regs))
2185 return 0;
2186
2187 if (counter->hw_event.exclude_kernel && !user_mode(regs))
2188 return 0;
2189
2190 return 1;
2191}
2192
2193static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
2194 int nmi, struct pt_regs *regs)
2195{
2196 int neg = atomic64_add_negative(nr, &counter->hw.count);
2197 if (counter->hw.irq_period && !neg)
2198 perf_swcounter_overflow(counter, nmi, regs);
2199}
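/*
 * Worked example of the period arithmetic above: with irq_period = 1000,
 * perf_swcounter_set_period() initializes both hw.prev_count and hw.count
 * to -1000.  Each perf_swcounter_add() adds its `nr' (1 for the software
 * events here); atomic64_add_negative() keeps returning true for the
 * first 999 events, and on the 1000th the sum reaches 0, !neg becomes
 * true and perf_swcounter_overflow() emits a sample and starts the next
 * period.
 */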
2200
2201static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
2202 enum perf_event_types type, u32 event,
2203 u64 nr, int nmi, struct pt_regs *regs)
2204{
2205 struct perf_counter *counter;
2206
2207 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2208 return;
2209
2210 rcu_read_lock();
2211 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2212 if (perf_swcounter_match(counter, type, event, regs))
2213 perf_swcounter_add(counter, nr, nmi, regs);
2214 }
2215 rcu_read_unlock();
2216}
2217
2218static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2219{
2220 if (in_nmi())
2221 return &cpuctx->recursion[3];
2222
2223 if (in_irq())
2224 return &cpuctx->recursion[2];
2225
2226 if (in_softirq())
2227 return &cpuctx->recursion[1];
2228
2229 return &cpuctx->recursion[0];
2230}
2231
2232static void __perf_swcounter_event(enum perf_event_types type, u32 event,
2233 u64 nr, int nmi, struct pt_regs *regs)
2234{
2235 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
2236 int *recursion = perf_swcounter_recursion_context(cpuctx);
2237
2238 if (*recursion)
2239 goto out;
2240
2241 (*recursion)++;
2242 barrier();
2243
2244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
2245 if (cpuctx->task_ctx) {
2246 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
2247 nr, nmi, regs);
2248 }
2249
2250 barrier();
2251 (*recursion)--;
2252
2253out:
2254 put_cpu_var(perf_cpu_context);
2255}
2256
2257void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
2258{
2259 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
2260}
2261
2262static void perf_swcounter_read(struct perf_counter *counter)
2263{
2264 perf_swcounter_update(counter);
2265}
2266
2267static int perf_swcounter_enable(struct perf_counter *counter)
2268{
2269 perf_swcounter_set_period(counter);
2270 return 0;
2271}
2272
2273static void perf_swcounter_disable(struct perf_counter *counter)
2274{
2275 perf_swcounter_update(counter);
2276}
2277
2278static const struct hw_perf_counter_ops perf_ops_generic = {
2279 .enable = perf_swcounter_enable,
2280 .disable = perf_swcounter_disable,
2281 .read = perf_swcounter_read,
2282};
2283
2284/*
2285 * Software counter: cpu wall time clock
2286 */
2287
2288static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2289{
2290 int cpu = raw_smp_processor_id();
2291 s64 prev;
2292 u64 now;
2293
2294 now = cpu_clock(cpu);
2295 prev = atomic64_read(&counter->hw.prev_count);
2296 atomic64_set(&counter->hw.prev_count, now);
2297 atomic64_add(now - prev, &counter->count);
2298}
2299
2300static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2301{
2302 struct hw_perf_counter *hwc = &counter->hw;
2303 int cpu = raw_smp_processor_id();
2304
2305 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
2306 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2307 hwc->hrtimer.function = perf_swcounter_hrtimer;
2308 if (hwc->irq_period) {
2309 __hrtimer_start_range_ns(&hwc->hrtimer,
2310 ns_to_ktime(hwc->irq_period), 0,
2311 HRTIMER_MODE_REL, 0);
2312 }
2313
2314 return 0;
2315}
2316
2317static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2318{
2319 hrtimer_cancel(&counter->hw.hrtimer);
2320 cpu_clock_perf_counter_update(counter);
2321}
2322
2323static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2324{
2325 cpu_clock_perf_counter_update(counter);
2326}
2327
2328static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
2329 .enable = cpu_clock_perf_counter_enable,
2330 .disable = cpu_clock_perf_counter_disable,
2331 .read = cpu_clock_perf_counter_read,
2332};
2333
2334/*
2335 * Software counter: task time clock
2336 */
2337
2338static void task_clock_perf_counter_update(struct perf_counter *counter)
2339{
2340 u64 prev, now;
2341 s64 delta;
2342
2343 now = counter->ctx->time;
2344
2345 prev = atomic64_xchg(&counter->hw.prev_count, now);
2346 delta = now - prev;
2347 atomic64_add(delta, &counter->count);
2348}
2349
2350static int task_clock_perf_counter_enable(struct perf_counter *counter)
2351{
2352 struct hw_perf_counter *hwc = &counter->hw;
2353 u64 now;
2354
2355 now = counter->ctx->time;
2356
2357 atomic64_set(&hwc->prev_count, now);
2358 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2359 hwc->hrtimer.function = perf_swcounter_hrtimer;
2360 if (hwc->irq_period) {
2361 __hrtimer_start_range_ns(&hwc->hrtimer,
2362 ns_to_ktime(hwc->irq_period), 0,
2363 HRTIMER_MODE_REL, 0);
2364 }
2365
2366 return 0;
2367}
2368
2369static void task_clock_perf_counter_disable(struct perf_counter *counter)
2370{
2371 hrtimer_cancel(&counter->hw.hrtimer);
2372 task_clock_perf_counter_update(counter);
2373}
2374
2375static void task_clock_perf_counter_read(struct perf_counter *counter)
2376{
2377 update_context_time(counter->ctx);
2378 task_clock_perf_counter_update(counter);
2379}
2380
2381static const struct hw_perf_counter_ops perf_ops_task_clock = {
2382 .enable = task_clock_perf_counter_enable,
2383 .disable = task_clock_perf_counter_disable,
2384 .read = task_clock_perf_counter_read,
2385};
2386
2387/*
2388 * Software counter: cpu migrations
2389 */
2390
2391static inline u64 get_cpu_migrations(struct perf_counter *counter)
2392{
2393 struct task_struct *curr = counter->ctx->task;
2394
2395 if (curr)
2396 return curr->se.nr_migrations;
2397 return cpu_nr_migrations(smp_processor_id());
2398}
2399
2400static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2401{
2402 u64 prev, now;
2403 s64 delta;
2404
2405 prev = atomic64_read(&counter->hw.prev_count);
2406 now = get_cpu_migrations(counter);
2407
2408 atomic64_set(&counter->hw.prev_count, now);
2409
2410 delta = now - prev;
2411
2412 atomic64_add(delta, &counter->count);
2413}
2414
2415static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2416{
2417 cpu_migrations_perf_counter_update(counter);
2418}
2419
2420static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
2421{
2422 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2423 atomic64_set(&counter->hw.prev_count,
2424 get_cpu_migrations(counter));
2425 return 0;
2426}
2427
2428static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2429{
2430 cpu_migrations_perf_counter_update(counter);
2431}
2432
2433static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
2434 .enable = cpu_migrations_perf_counter_enable,
2435 .disable = cpu_migrations_perf_counter_disable,
2436 .read = cpu_migrations_perf_counter_read,
2437};
2438
2439#ifdef CONFIG_EVENT_PROFILE
2440void perf_tpcounter_event(int event_id)
2441{
2442 struct pt_regs *regs = get_irq_regs();
2443
2444 if (!regs)
2445 regs = task_pt_regs(current);
2446
2447 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
2448}
2449
2450extern int ftrace_profile_enable(int);
2451extern void ftrace_profile_disable(int);
2452
2453static void tp_perf_counter_destroy(struct perf_counter *counter)
2454{
2455 ftrace_profile_disable(perf_event_id(&counter->hw_event));
2456}
2457
2458static const struct hw_perf_counter_ops *
2459tp_perf_counter_init(struct perf_counter *counter)
2460{
2461 int event_id = perf_event_id(&counter->hw_event);
2462 int ret;
2463
2464 ret = ftrace_profile_enable(event_id);
2465 if (ret)
2466 return NULL;
2467
2468 counter->destroy = tp_perf_counter_destroy;
2469 counter->hw.irq_period = counter->hw_event.irq_period;
2470
2471 return &perf_ops_generic;
2472}
2473#else
2474static const struct hw_perf_counter_ops *
2475tp_perf_counter_init(struct perf_counter *counter)
2476{
2477 return NULL;
2478}
2479#endif
2480
2481static const struct hw_perf_counter_ops *
2482sw_perf_counter_init(struct perf_counter *counter)
2483{
2484 struct perf_counter_hw_event *hw_event = &counter->hw_event;
2485 const struct hw_perf_counter_ops *hw_ops = NULL;
2486 struct hw_perf_counter *hwc = &counter->hw;
2487
2488 /*
2489 * Software counters (currently) can't in general distinguish
2490 * between user, kernel and hypervisor events.
2491 * However, context switches and cpu migrations are considered
2492 * to be kernel events, and page faults are never hypervisor
2493 * events.
2494 */
2495 switch (perf_event_id(&counter->hw_event)) {
2496 case PERF_COUNT_CPU_CLOCK:
2497 hw_ops = &perf_ops_cpu_clock;
2498
2499 if (hw_event->irq_period && hw_event->irq_period < 10000)
2500 hw_event->irq_period = 10000;
2501 break;
2502 case PERF_COUNT_TASK_CLOCK:
2503 /*
2504 * If the user instantiates this as a per-cpu counter,
2505 * use the cpu_clock counter instead.
2506 */
2507 if (counter->ctx->task)
2508 hw_ops = &perf_ops_task_clock;
2509 else
2510 hw_ops = &perf_ops_cpu_clock;
2511
2512 if (hw_event->irq_period && hw_event->irq_period < 10000)
2513 hw_event->irq_period = 10000;
2514 break;
2515 case PERF_COUNT_PAGE_FAULTS:
2516 case PERF_COUNT_PAGE_FAULTS_MIN:
2517 case PERF_COUNT_PAGE_FAULTS_MAJ:
2518 case PERF_COUNT_CONTEXT_SWITCHES:
2519 hw_ops = &perf_ops_generic;
2520 break;
2521 case PERF_COUNT_CPU_MIGRATIONS:
2522 if (!counter->hw_event.exclude_kernel)
2523 hw_ops = &perf_ops_cpu_migrations;
2524 break;
2525 }
2526
2527 if (hw_ops)
2528 hwc->irq_period = hw_event->irq_period;
2529
2530 return hw_ops;
2531}
2532
2533/*
2534 * Allocate and initialize a counter structure
2535 */
2536static struct perf_counter *
2537perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2538 int cpu,
2539 struct perf_counter_context *ctx,
2540 struct perf_counter *group_leader,
2541 gfp_t gfpflags)
2542{
2543 const struct hw_perf_counter_ops *hw_ops;
2544 struct perf_counter *counter;
2545 long err;
2546
2547 counter = kzalloc(sizeof(*counter), gfpflags);
2548 if (!counter)
2549 return ERR_PTR(-ENOMEM);
2550
2551 /*
2552 * Single counters are their own group leaders, with an
2553 * empty sibling list:
2554 */
2555 if (!group_leader)
2556 group_leader = counter;
2557
2558 mutex_init(&counter->mutex);
2559 INIT_LIST_HEAD(&counter->list_entry);
2560 INIT_LIST_HEAD(&counter->event_entry);
2561 INIT_LIST_HEAD(&counter->sibling_list);
2562 init_waitqueue_head(&counter->waitq);
2563
2564 mutex_init(&counter->mmap_mutex);
2565
2566 INIT_LIST_HEAD(&counter->child_list);
2567
2568 counter->cpu = cpu;
2569 counter->hw_event = *hw_event;
2570 counter->group_leader = group_leader;
2571 counter->hw_ops = NULL;
2572 counter->ctx = ctx;
2573
2574 counter->state = PERF_COUNTER_STATE_INACTIVE;
2575 if (hw_event->disabled)
2576 counter->state = PERF_COUNTER_STATE_OFF;
2577
2578 hw_ops = NULL;
2579
2580 if (perf_event_raw(hw_event)) {
2581 hw_ops = hw_perf_counter_init(counter);
2582 goto done;
2583 }
2584
2585 switch (perf_event_type(hw_event)) {
2586 case PERF_TYPE_HARDWARE:
2587 hw_ops = hw_perf_counter_init(counter);
2588 break;
2589
2590 case PERF_TYPE_SOFTWARE:
2591 hw_ops = sw_perf_counter_init(counter);
2592 break;
2593
2594 case PERF_TYPE_TRACEPOINT:
2595 hw_ops = tp_perf_counter_init(counter);
2596 break;
2597 }
2598done:
2599 err = 0;
2600 if (!hw_ops)
2601 err = -EINVAL;
2602 else if (IS_ERR(hw_ops))
2603 err = PTR_ERR(hw_ops);
2604
2605 if (err) {
2606 kfree(counter);
2607 return ERR_PTR(err);
2608 }
2609
2610 counter->hw_ops = hw_ops;
2611
2612 return counter;
2613}
2614
2615/**
2616 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
2617 *
2618 * @hw_event_uptr: event type attributes for monitoring/sampling
2619 * @pid: target pid
2620 * @cpu: target cpu
2621 * @group_fd: group leader counter fd
2622 */
2623SYSCALL_DEFINE5(perf_counter_open,
2624 const struct perf_counter_hw_event __user *, hw_event_uptr,
2625 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
2626{
2627 struct perf_counter *counter, *group_leader;
2628 struct perf_counter_hw_event hw_event;
2629 struct perf_counter_context *ctx;
2630 struct file *counter_file = NULL;
2631 struct file *group_file = NULL;
2632 int fput_needed = 0;
2633 int fput_needed2 = 0;
2634 int ret;
2635
2636 /* for future expandability... */
2637 if (flags)
2638 return -EINVAL;
2639
2640 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
2641 return -EFAULT;
2642
2643 /*
2644 * Get the target context (task or percpu):
2645 */
2646 ctx = find_get_context(pid, cpu);
2647 if (IS_ERR(ctx))
2648 return PTR_ERR(ctx);
2649
2650 /*
2651 * Look up the group leader (we will attach this counter to it):
2652 */
2653 group_leader = NULL;
2654 if (group_fd != -1) {
2655 ret = -EINVAL;
2656 group_file = fget_light(group_fd, &fput_needed);
2657 if (!group_file)
2658 goto err_put_context;
2659 if (group_file->f_op != &perf_fops)
2660 goto err_put_context;
2661
2662 group_leader = group_file->private_data;
2663 /*
2664		 * Do not allow a recursive hierarchy (attaching this new
2665		 * sibling to a counter that is itself only a group sibling):
2666 */
2667 if (group_leader->group_leader != group_leader)
2668 goto err_put_context;
2669 /*
2670		 * Do not allow attaching to a group in a different
2671 * task or CPU context:
2672 */
2673 if (group_leader->ctx != ctx)
2674 goto err_put_context;
2675 /*
2676 * Only a group leader can be exclusive or pinned
2677 */
2678 if (hw_event.exclusive || hw_event.pinned)
2679 goto err_put_context;
2680 }
2681
2682 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2683 GFP_KERNEL);
2684 ret = PTR_ERR(counter);
2685 if (IS_ERR(counter))
2686 goto err_put_context;
2687
2688 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2689 if (ret < 0)
2690 goto err_free_put_context;
2691
2692 counter_file = fget_light(ret, &fput_needed2);
2693 if (!counter_file)
2694 goto err_free_put_context;
2695
2696 counter->filp = counter_file;
2697 mutex_lock(&ctx->mutex);
2698 perf_install_in_context(ctx, counter, cpu);
2699 mutex_unlock(&ctx->mutex);
2700
2701 fput_light(counter_file, fput_needed2);
2702
2703out_fput:
2704 fput_light(group_file, fput_needed);
2705
2706 return ret;
2707
2708err_free_put_context:
2709 kfree(counter);
2710
2711err_put_context:
2712 put_context(ctx);
2713
2714 goto out_fput;
2715}
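/*
 * Minimal user-space sketch of using the syscall above (illustrative
 * only, not part of the patch).  It assumes the __NR_perf_counter_open
 * number from the arch unistd headers added by this patch and the usual
 * unistd/syscall/ioctl/string headers; the hw_event.config encoding uses
 * the type/id packing helpers of <linux/perf_counter.h> and is only
 * hinted at here -- see Documentation/perf_counter/kerneltop.c for
 * complete usage.
 */
	struct perf_counter_hw_event hw_event;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	/* hw_event.config = <packed event type/id, e.g. a sw clock event>; */
	hw_event.irq_period = 0;		/* pure counting, no sampling */
	hw_event.disabled   = 1;		/* start disabled */

	fd = syscall(__NR_perf_counter_open, &hw_event,
		     0,		/* pid:      the calling task */
		     -1,	/* cpu:      follow the task on all CPUs */
		     -1,	/* group_fd: start a new group */
		     0);	/* flags:    must be 0 */

	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
	/* ... run the workload, read() the fd as in perf_read_hw() above ... */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);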
2716
2717/*
2718 * Initialize the perf_counter context in a task_struct:
2719 */
2720static void
2721__perf_counter_init_context(struct perf_counter_context *ctx,
2722 struct task_struct *task)
2723{
2724 memset(ctx, 0, sizeof(*ctx));
2725 spin_lock_init(&ctx->lock);
2726 mutex_init(&ctx->mutex);
2727 INIT_LIST_HEAD(&ctx->counter_list);
2728 INIT_LIST_HEAD(&ctx->event_list);
2729 ctx->task = task;
2730}
2731
2732/*
2733 * inherit a counter from parent task to child task:
2734 */
2735static struct perf_counter *
2736inherit_counter(struct perf_counter *parent_counter,
2737 struct task_struct *parent,
2738 struct perf_counter_context *parent_ctx,
2739 struct task_struct *child,
2740 struct perf_counter *group_leader,
2741 struct perf_counter_context *child_ctx)
2742{
2743 struct perf_counter *child_counter;
2744
2745 /*
2746 * Instead of creating recursive hierarchies of counters,
2747 * we link inherited counters back to the original parent,
2748	 * which is guaranteed to have a filp that we use for
2749	 * reference counting:
2750 */
2751 if (parent_counter->parent)
2752 parent_counter = parent_counter->parent;
2753
2754 child_counter = perf_counter_alloc(&parent_counter->hw_event,
2755 parent_counter->cpu, child_ctx,
2756 group_leader, GFP_KERNEL);
2757 if (IS_ERR(child_counter))
2758 return child_counter;
2759
2760 /*
2761 * Link it up in the child's context:
2762 */
2763 child_counter->task = child;
2764 add_counter_to_ctx(child_counter, child_ctx);
2765
2766 child_counter->parent = parent_counter;
2767 /*
2768 * inherit into child's child as well:
2769 */
2770 child_counter->hw_event.inherit = 1;
2771
2772 /*
2773 * Get a reference to the parent filp - we will fput it
2774 * when the child counter exits. This is safe to do because
2775 * we are in the parent and we know that the filp still
2776 * exists and has a nonzero count:
2777 */
2778 atomic_long_inc(&parent_counter->filp->f_count);
2779
2780 /*
2781 * Link this into the parent counter's child list
2782 */
2783 mutex_lock(&parent_counter->mutex);
2784 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2785
2786 /*
2787 * Make the child state follow the state of the parent counter,
2788 * not its hw_event.disabled bit. We hold the parent's mutex,
2789 * so we won't race with perf_counter_{en,dis}able_family.
2790 */
2791 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2792 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2793 else
2794 child_counter->state = PERF_COUNTER_STATE_OFF;
2795
2796 mutex_unlock(&parent_counter->mutex);
2797
2798 return child_counter;
2799}
2800
2801static int inherit_group(struct perf_counter *parent_counter,
2802 struct task_struct *parent,
2803 struct perf_counter_context *parent_ctx,
2804 struct task_struct *child,
2805 struct perf_counter_context *child_ctx)
2806{
2807 struct perf_counter *leader;
2808 struct perf_counter *sub;
2809 struct perf_counter *child_ctr;
2810
2811 leader = inherit_counter(parent_counter, parent, parent_ctx,
2812 child, NULL, child_ctx);
2813 if (IS_ERR(leader))
2814 return PTR_ERR(leader);
2815 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2816 child_ctr = inherit_counter(sub, parent, parent_ctx,
2817 child, leader, child_ctx);
2818 if (IS_ERR(child_ctr))
2819 return PTR_ERR(child_ctr);
2820 }
2821 return 0;
2822}
2823
2824static void sync_child_counter(struct perf_counter *child_counter,
2825 struct perf_counter *parent_counter)
2826{
2827 u64 parent_val, child_val;
2828
2829 parent_val = atomic64_read(&parent_counter->count);
2830 child_val = atomic64_read(&child_counter->count);
2831
2832 /*
2833 * Add back the child's count to the parent's count:
2834 */
2835 atomic64_add(child_val, &parent_counter->count);
2836 atomic64_add(child_counter->total_time_enabled,
2837 &parent_counter->child_total_time_enabled);
2838 atomic64_add(child_counter->total_time_running,
2839 &parent_counter->child_total_time_running);
2840
2841 /*
2842 * Remove this counter from the parent's list
2843 */
2844 mutex_lock(&parent_counter->mutex);
2845 list_del_init(&child_counter->child_list);
2846 mutex_unlock(&parent_counter->mutex);
2847
2848 /*
2849 * Release the parent counter, if this was the last
2850 * reference to it.
2851 */
2852 fput(parent_counter->filp);
2853}
2854
2855static void
2856__perf_counter_exit_task(struct task_struct *child,
2857 struct perf_counter *child_counter,
2858 struct perf_counter_context *child_ctx)
2859{
2860 struct perf_counter *parent_counter;
2861 struct perf_counter *sub, *tmp;
2862
2863 /*
2864 * If we do not self-reap then we have to wait for the
2865	 * child task to unschedule (which is guaranteed to happen),
2866 * so that its counter is at its final count. (This
2867 * condition triggers rarely - child tasks usually get
2868 * off their CPU before the parent has a chance to
2869	 * get this far into the reaping action.)
2870 */
2871 if (child != current) {
2872 wait_task_inactive(child, 0);
2873 list_del_init(&child_counter->list_entry);
2874 update_counter_times(child_counter);
2875 } else {
2876 struct perf_cpu_context *cpuctx;
2877 unsigned long flags;
2878 u64 perf_flags;
2879
2880 /*
2881 * Disable and unlink this counter.
2882 *
2883 * Be careful about zapping the list - IRQ/NMI context
2884 * could still be processing it:
2885 */
2886 local_irq_save(flags);
2887 perf_flags = hw_perf_save_disable();
2888
2889 cpuctx = &__get_cpu_var(perf_cpu_context);
2890
2891 group_sched_out(child_counter, cpuctx, child_ctx);
2892 update_counter_times(child_counter);
2893
2894 list_del_init(&child_counter->list_entry);
2895
2896 child_ctx->nr_counters--;
2897
2898 hw_perf_restore(perf_flags);
2899 local_irq_restore(flags);
2900 }
2901
2902 parent_counter = child_counter->parent;
2903 /*
2904	 * It can happen that the parent exits first, and has counters
2905	 * that are still around due to the child reference. These
2906	 * counters need to be zapped here, otherwise they would linger.
2907 */
2908 if (parent_counter) {
2909 sync_child_counter(child_counter, parent_counter);
2910 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2911 list_entry) {
2912 if (sub->parent) {
2913 sync_child_counter(sub, sub->parent);
2914 free_counter(sub);
2915 }
2916 }
2917 free_counter(child_counter);
2918 }
2919}
2920
2921/*
2922 * When a child task exits, feed back counter values to parent counters.
2923 *
2924 * Note: we may be running in child context, but the PID is not hashed
2925 * anymore so new counters will not be added.
2926 */
2927void perf_counter_exit_task(struct task_struct *child)
2928{
2929 struct perf_counter *child_counter, *tmp;
2930 struct perf_counter_context *child_ctx;
2931
2932 child_ctx = &child->perf_counter_ctx;
2933
2934 if (likely(!child_ctx->nr_counters))
2935 return;
2936
2937 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2938 list_entry)
2939 __perf_counter_exit_task(child, child_counter, child_ctx);
2940}
2941
2942/*
2943 * Initialize the perf_counter context in task_struct
2944 */
2945void perf_counter_init_task(struct task_struct *child)
2946{
2947 struct perf_counter_context *child_ctx, *parent_ctx;
2948 struct perf_counter *counter;
2949 struct task_struct *parent = current;
2950
2951 child_ctx = &child->perf_counter_ctx;
2952 parent_ctx = &parent->perf_counter_ctx;
2953
2954 __perf_counter_init_context(child_ctx, child);
2955
2956 /*
2957 * This is executed from the parent task context, so inherit
2958 * counters that have been marked for cloning:
2959 */
2960
2961 if (likely(!parent_ctx->nr_counters))
2962 return;
2963
2964 /*
2965 * Lock the parent list. No need to lock the child - not PID
2966 * hashed yet and not running, so nobody can access it.
2967 */
2968 mutex_lock(&parent_ctx->mutex);
2969
2970 /*
2971	 * We don't have to disable NMIs - we are only looking at
2972 * the list, not manipulating it:
2973 */
2974 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2975 if (!counter->hw_event.inherit)
2976 continue;
2977
2978 if (inherit_group(counter, parent,
2979 parent_ctx, child, child_ctx))
2980 break;
2981 }
2982
2983 mutex_unlock(&parent_ctx->mutex);
2984}
2985
2986static void __cpuinit perf_counter_init_cpu(int cpu)
2987{
2988 struct perf_cpu_context *cpuctx;
2989
2990 cpuctx = &per_cpu(perf_cpu_context, cpu);
2991 __perf_counter_init_context(&cpuctx->ctx, NULL);
2992
2993 mutex_lock(&perf_resource_mutex);
2994 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2995 mutex_unlock(&perf_resource_mutex);
2996
2997 hw_perf_counter_setup(cpu);
2998}
2999
3000#ifdef CONFIG_HOTPLUG_CPU
3001static void __perf_counter_exit_cpu(void *info)
3002{
3003 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3004 struct perf_counter_context *ctx = &cpuctx->ctx;
3005 struct perf_counter *counter, *tmp;
3006
3007 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3008 __perf_counter_remove_from_context(counter);
3009}
3010static void perf_counter_exit_cpu(int cpu)
3011{
3012 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3013 struct perf_counter_context *ctx = &cpuctx->ctx;
3014
3015 mutex_lock(&ctx->mutex);
3016 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
3017 mutex_unlock(&ctx->mutex);
3018}
3019#else
3020static inline void perf_counter_exit_cpu(int cpu) { }
3021#endif
3022
3023static int __cpuinit
3024perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3025{
3026 unsigned int cpu = (long)hcpu;
3027
3028 switch (action) {
3029
3030 case CPU_UP_PREPARE:
3031 case CPU_UP_PREPARE_FROZEN:
3032 perf_counter_init_cpu(cpu);
3033 break;
3034
3035 case CPU_DOWN_PREPARE:
3036 case CPU_DOWN_PREPARE_FROZEN:
3037 perf_counter_exit_cpu(cpu);
3038 break;
3039
3040 default:
3041 break;
3042 }
3043
3044 return NOTIFY_OK;
3045}
3046
3047static struct notifier_block __cpuinitdata perf_cpu_nb = {
3048 .notifier_call = perf_cpu_notify,
3049};
3050
3051static int __init perf_counter_init(void)
3052{
3053 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3054 (void *)(long)smp_processor_id());
3055 register_cpu_notifier(&perf_cpu_nb);
3056
3057 return 0;
3058}
3059early_initcall(perf_counter_init);
3060
3061static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3062{
3063 return sprintf(buf, "%d\n", perf_reserved_percpu);
3064}
3065
3066static ssize_t
3067perf_set_reserve_percpu(struct sysdev_class *class,
3068 const char *buf,
3069 size_t count)
3070{
3071 struct perf_cpu_context *cpuctx;
3072 unsigned long val;
3073 int err, cpu, mpt;
3074
3075 err = strict_strtoul(buf, 10, &val);
3076 if (err)
3077 return err;
3078 if (val > perf_max_counters)
3079 return -EINVAL;
3080
3081 mutex_lock(&perf_resource_mutex);
3082 perf_reserved_percpu = val;
3083 for_each_online_cpu(cpu) {
3084 cpuctx = &per_cpu(perf_cpu_context, cpu);
3085 spin_lock_irq(&cpuctx->ctx.lock);
3086 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3087 perf_max_counters - perf_reserved_percpu);
3088 cpuctx->max_pertask = mpt;
3089 spin_unlock_irq(&cpuctx->ctx.lock);
3090 }
3091 mutex_unlock(&perf_resource_mutex);
3092
3093 return count;
3094}
3095
3096static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3097{
3098 return sprintf(buf, "%d\n", perf_overcommit);
3099}
3100
3101static ssize_t
3102perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3103{
3104 unsigned long val;
3105 int err;
3106
3107 err = strict_strtoul(buf, 10, &val);
3108 if (err)
3109 return err;
3110 if (val > 1)
3111 return -EINVAL;
3112
3113 mutex_lock(&perf_resource_mutex);
3114 perf_overcommit = val;
3115 mutex_unlock(&perf_resource_mutex);
3116
3117 return count;
3118}
3119
3120static SYSDEV_CLASS_ATTR(
3121 reserve_percpu,
3122 0644,
3123 perf_show_reserve_percpu,
3124 perf_set_reserve_percpu
3125 );
3126
3127static SYSDEV_CLASS_ATTR(
3128 overcommit,
3129 0644,
3130 perf_show_overcommit,
3131 perf_set_overcommit
3132 );
3133
3134static struct attribute *perfclass_attrs[] = {
3135 &attr_reserve_percpu.attr,
3136 &attr_overcommit.attr,
3137 NULL
3138};
3139
3140static struct attribute_group perfclass_attr_group = {
3141 .attrs = perfclass_attrs,
3142 .name = "perf_counters",
3143};
3144
3145static int __init perf_counter_sysfs_init(void)
3146{
3147 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3148 &perfclass_attr_group);
3149}
3150device_initcall(perf_counter_sysfs_init);
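/*
 * With the attribute group above, the two tunables should appear as
 * /sys/devices/system/cpu/perf_counters/reserve_percpu and
 * /sys/devices/system/cpu/perf_counters/overcommit (path inferred from
 * the cpu sysdev class used here).
 */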
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc1fd5d5072..b66a08c2480e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -584,6 +584,7 @@ struct rq {
584 struct load_weight load; 584 struct load_weight load;
585 unsigned long nr_load_updates; 585 unsigned long nr_load_updates;
586 u64 nr_switches; 586 u64 nr_switches;
587 u64 nr_migrations_in;
587 588
588 struct cfs_rq cfs; 589 struct cfs_rq cfs;
589 struct rt_rq rt; 590 struct rt_rq rt;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 693#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 695
695static inline void update_rq_clock(struct rq *rq) 696inline void update_rq_clock(struct rq *rq)
696{ 697{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 698 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 699}
@@ -1955,12 +1956,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1955 p->se.sleep_start -= clock_offset; 1956 p->se.sleep_start -= clock_offset;
1956 if (p->se.block_start) 1957 if (p->se.block_start)
1957 p->se.block_start -= clock_offset; 1958 p->se.block_start -= clock_offset;
1959#endif
1958 if (old_cpu != new_cpu) { 1960 if (old_cpu != new_cpu) {
1959 schedstat_inc(p, se.nr_migrations); 1961 p->se.nr_migrations++;
1962 new_rq->nr_migrations_in++;
1963#ifdef CONFIG_SCHEDSTATS
1960 if (task_hot(p, old_rq->clock, NULL)) 1964 if (task_hot(p, old_rq->clock, NULL))
1961 schedstat_inc(p, se.nr_forced2_migrations); 1965 schedstat_inc(p, se.nr_forced2_migrations);
1962 }
1963#endif 1966#endif
1967 }
1964 p->se.vruntime -= old_cfsrq->min_vruntime - 1968 p->se.vruntime -= old_cfsrq->min_vruntime -
1965 new_cfsrq->min_vruntime; 1969 new_cfsrq->min_vruntime;
1966 1970
@@ -2312,6 +2316,27 @@ static int sched_balance_self(int cpu, int flag)
2312 2316
2313#endif /* CONFIG_SMP */ 2317#endif /* CONFIG_SMP */
2314 2318
2319/**
2320 * task_oncpu_function_call - call a function on the cpu on which a task runs
2321 * @p: the task to evaluate
2322 * @func: the function to be called
2323 * @info: the function call argument
2324 *
2325 * Calls the function @func when the task is currently running. This might
 2326 * be on the current CPU, in which case the function is called directly.
2327 */
2328void task_oncpu_function_call(struct task_struct *p,
2329 void (*func) (void *info), void *info)
2330{
2331 int cpu;
2332
2333 preempt_disable();
2334 cpu = task_cpu(p);
2335 if (task_curr(p))
2336 smp_call_function_single(cpu, func, info, 1);
2337 preempt_enable();
2338}
2339
2315/*** 2340/***
2316 * try_to_wake_up - wake up a thread 2341 * try_to_wake_up - wake up a thread
2317 * @p: the to-be-woken-up thread 2342 * @p: the to-be-woken-up thread
@@ -2468,6 +2493,7 @@ static void __sched_fork(struct task_struct *p)
2468 p->se.exec_start = 0; 2493 p->se.exec_start = 0;
2469 p->se.sum_exec_runtime = 0; 2494 p->se.sum_exec_runtime = 0;
2470 p->se.prev_sum_exec_runtime = 0; 2495 p->se.prev_sum_exec_runtime = 0;
2496 p->se.nr_migrations = 0;
2471 p->se.last_wakeup = 0; 2497 p->se.last_wakeup = 0;
2472 p->se.avg_overlap = 0; 2498 p->se.avg_overlap = 0;
2473 p->se.start_runtime = 0; 2499 p->se.start_runtime = 0;
@@ -2698,6 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2698 */ 2724 */
2699 prev_state = prev->state; 2725 prev_state = prev->state;
2700 finish_arch_switch(prev); 2726 finish_arch_switch(prev);
2727 perf_counter_task_sched_in(current, cpu_of(rq));
2701 finish_lock_switch(rq, prev); 2728 finish_lock_switch(rq, prev);
2702#ifdef CONFIG_SMP 2729#ifdef CONFIG_SMP
2703 if (post_schedule) 2730 if (post_schedule)
@@ -2860,6 +2887,15 @@ unsigned long nr_active(void)
2860} 2887}
2861 2888
2862/* 2889/*
2890 * Externally visible per-cpu scheduler statistics:
2891 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2892 */
2893u64 cpu_nr_migrations(int cpu)
2894{
2895 return cpu_rq(cpu)->nr_migrations_in;
2896}
2897
2898/*
2863 * Update rq->cpu_load[] statistics. This function is usually called every 2899 * Update rq->cpu_load[] statistics. This function is usually called every
2864 * scheduler tick (TICK_NSEC). 2900 * scheduler tick (TICK_NSEC).
2865 */ 2901 */
@@ -4514,6 +4550,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
4514 * Return any ns on the sched_clock that have not yet been banked in 4550 * Return any ns on the sched_clock that have not yet been banked in
4515 * @p in case that task is currently running. 4551 * @p in case that task is currently running.
4516 */ 4552 */
4553unsigned long long __task_delta_exec(struct task_struct *p, int update)
4554{
4555 s64 delta_exec;
4556 struct rq *rq;
4557
4558 rq = task_rq(p);
4559 WARN_ON_ONCE(!runqueue_is_locked());
4560 WARN_ON_ONCE(!task_current(rq, p));
4561
4562 if (update)
4563 update_rq_clock(rq);
4564
4565 delta_exec = rq->clock - p->se.exec_start;
4566
4567 WARN_ON_ONCE(delta_exec < 0);
4568
4569 return delta_exec;
4570}
4571
4572/*
4573 * Return any ns on the sched_clock that have not yet been banked in
4574 * @p in case that task is currently running.
4575 */
4517unsigned long long task_delta_exec(struct task_struct *p) 4576unsigned long long task_delta_exec(struct task_struct *p)
4518{ 4577{
4519 unsigned long flags; 4578 unsigned long flags;
@@ -4773,6 +4832,7 @@ void scheduler_tick(void)
4773 update_rq_clock(rq); 4832 update_rq_clock(rq);
4774 update_cpu_load(rq); 4833 update_cpu_load(rq);
4775 curr->sched_class->task_tick(rq, curr, 0); 4834 curr->sched_class->task_tick(rq, curr, 0);
4835 perf_counter_task_tick(curr, cpu);
4776 spin_unlock(&rq->lock); 4836 spin_unlock(&rq->lock);
4777 4837
4778#ifdef CONFIG_SMP 4838#ifdef CONFIG_SMP
@@ -4988,6 +5048,7 @@ need_resched_nonpreemptible:
 
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+                perf_counter_task_sched_out(prev, cpu);
 
                 rq->nr_switches++;
                 rq->curr = next;
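
The scheduler hooks added above (perf_counter_task_sched_out() at the
context switch, perf_counter_task_sched_in() in finish_task_switch() and
perf_counter_task_tick() from scheduler_tick()) have to compile away on
kernels built without the feature. A minimal sketch of the no-op stubs one
would expect for CONFIG_PERF_COUNTERS=n, assuming the usual static-inline
convention; the authoritative declarations live in <linux/perf_counter.h>,
which is not part of this excerpt:

    #ifndef CONFIG_PERF_COUNTERS
    /* Hypothetical stubs: signatures taken from the call sites above. */
    static inline void
    perf_counter_task_sched_in(struct task_struct *task, int cpu)  { }
    static inline void
    perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
    static inline void
    perf_counter_task_tick(struct task_struct *task, int cpu)      { }
    #endif
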
diff --git a/kernel/sys.c b/kernel/sys.c
index 51dbb55604e8..14c4c5613118 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1799,6 +1800,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
         case PR_SET_TSC:
                 error = SET_TSC_CTL(arg2);
                 break;
+        case PR_TASK_PERF_COUNTERS_DISABLE:
+                error = perf_counter_task_disable();
+                break;
+        case PR_TASK_PERF_COUNTERS_ENABLE:
+                error = perf_counter_task_enable();
+                break;
         case PR_GET_TIMERSLACK:
                 error = current->timer_slack_ns;
                 break;
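
The two new prctl() options let a task pause and resume all of its own
counters without tearing down the file descriptors behind them. A minimal
userspace sketch, assuming only the PR_TASK_PERF_COUNTERS_* constants that
this patch adds to <linux/prctl.h> (their numeric values are not shown in
this excerpt):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>    /* PR_TASK_PERF_COUNTERS_{DISABLE,ENABLE} */

    int main(void)
    {
        /* Stop counting for the calling task... */
        if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
            perror("PR_TASK_PERF_COUNTERS_DISABLE");

        /* ...run the section that should not be measured... */

        /* ...then start counting again. */
        if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
            perror("PR_TASK_PERF_COUNTERS_ENABLE");

        return 0;
    }
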
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
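
Wiring sys_perf_counter_open up through cond_syscall() means a kernel built
without the feature still links and simply returns -ENOSYS from the syscall.
A hedged probe for that case, assuming the __NR_perf_counter_open number
exported by the patched unistd.h headers; the arguments are deliberately
left as zeros because only the ENOSYS path is being tested here:

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
        long ret = syscall(__NR_perf_counter_open, 0L, 0L, 0L, 0L, 0L);

        if (ret < 0 && errno == ENOSYS)
            puts("perf counters are not built into this kernel");
        else
            puts("sys_perf_counter_open is available");

        return 0;
    }
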
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4..672ca25fbc43 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
         struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+        perf_counter_do_pending();
+
         hrtimer_run_pending();
 
         if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a3841186c11..1df63f614f97 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1223,6 +1224,9 @@ munmap_back:
         if (correct_wcount)
                 atomic_inc(&inode->i_writecount);
 out:
+        if (vm_flags & VM_EXEC)
+                perf_counter_mmap(addr, len, pgoff, file);
+
         mm->total_vm += len >> PAGE_SHIFT;
         vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
         if (vm_flags & VM_LOCKED) {
@@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
         do {
                 long nrpages = vma_pages(vma);
 
+                if (vma->vm_flags & VM_EXEC) {
+                        perf_counter_munmap(vma->vm_start,
+                                        nrpages << PAGE_SHIFT,
+                                        vma->vm_pgoff, vma->vm_file);
+                }
+
                 mm->total_vm -= nrpages;
                 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
                 vma = remove_vma(vma);
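
The mmap()/munmap() hooks above fire only for VM_EXEC mappings, which is
what a profiler needs in order to map user-space samples back to a file and
offset. As a sketch, the prototypes implied by the two call sites; the
authoritative declarations live in <linux/perf_counter.h>, which is not part
of this excerpt:

    extern void perf_counter_mmap(unsigned long addr, unsigned long len,
                                  unsigned long pgoff, struct file *file);
    extern void perf_counter_munmap(unsigned long addr, unsigned long len,
                                    unsigned long pgoff, struct file *file);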