-rw-r--r-- Documentation/perf-counters.txt | 147
-rw-r--r-- arch/powerpc/include/asm/hw_irq.h | 31
-rw-r--r-- arch/powerpc/include/asm/paca.h | 1
-rw-r--r-- arch/powerpc/include/asm/perf_counter.h | 72
-rw-r--r-- arch/powerpc/include/asm/systbl.h | 1
-rw-r--r-- arch/powerpc/include/asm/unistd.h | 3
-rw-r--r-- arch/powerpc/kernel/Makefile | 1
-rw-r--r-- arch/powerpc/kernel/asm-offsets.c | 1
-rw-r--r-- arch/powerpc/kernel/entry_64.S | 9
-rw-r--r-- arch/powerpc/kernel/irq.c | 10
-rw-r--r-- arch/powerpc/kernel/perf_counter.c | 785
-rw-r--r-- arch/powerpc/kernel/power6-pmu.c | 283
-rw-r--r-- arch/powerpc/kernel/ppc970-pmu.c | 375
-rw-r--r-- arch/powerpc/platforms/Kconfig.cputype | 1
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/ia32/ia32entry.S | 3
-rw-r--r-- arch/x86/include/asm/atomic_32.h | 218
-rw-r--r-- arch/x86/include/asm/hardirq_32.h | 1
-rw-r--r-- arch/x86/include/asm/hardirq_64.h | 1
-rw-r--r-- arch/x86/include/asm/hw_irq.h | 2
-rw-r--r-- arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r-- arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r-- arch/x86/include/asm/mach-default/entry_arch.h | 5
-rw-r--r-- arch/x86/include/asm/perf_counter.h | 95
-rw-r--r-- arch/x86/include/asm/thread_info.h | 4
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 1
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 3
-rw-r--r-- arch/x86/kernel/apic.c | 2
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 12
-rw-r--r-- arch/x86/kernel/cpu/common.c | 2
-rw-r--r-- arch/x86/kernel/cpu/perf_counter.c | 695
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r-- arch/x86/kernel/entry_64.S | 5
-rw-r--r-- arch/x86/kernel/irq.c | 5
-rw-r--r-- arch/x86/kernel/irqinit_32.c | 3
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 5
-rw-r--r-- arch/x86/kernel/signal.c | 7
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r-- arch/x86/oprofile/op_model_ppro.c | 2
-rw-r--r-- drivers/acpi/processor_idle.c | 8
-rw-r--r-- drivers/char/sysrq.c | 2
-rw-r--r-- fs/exec.c | 8
-rw-r--r-- include/linux/init_task.h | 11
-rw-r--r-- include/linux/kernel_stat.h | 8
-rw-r--r-- include/linux/perf_counter.h | 290
-rw-r--r-- include/linux/prctl.h | 3
-rw-r--r-- include/linux/sched.h | 12
-rw-r--r-- include/linux/syscalls.h | 8
-rw-r--r-- init/Kconfig | 30
-rw-r--r-- kernel/Makefile | 1
-rw-r--r-- kernel/exit.c | 13
-rw-r--r-- kernel/fork.c | 1
-rw-r--r-- kernel/perf_counter.c | 2169
-rw-r--r-- kernel/sched.c | 76
-rw-r--r-- kernel/sys.c | 7
-rw-r--r-- kernel/sys_ni.c | 3
56 files changed, 5430 insertions, 51 deletions
diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
new file mode 100644
index 000000000000..fddd32189a50
--- /dev/null
+++ b/Documentation/perf-counters.txt
@@ -0,0 +1,147 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
6CPUs. These registers count the number of certain types of hardware events,
7such as instructions executed, cache misses suffered, or branches mis-predicted -
8without slowing down the kernel or applications. These registers can also
9trigger interrupts when a threshold number of events has been reached - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per-task and per-CPU counters, counter
14groups, and event capabilities on top of those.
15
16Performance counters are accessed via special file descriptors.
17There's one file descriptor per virtual counter used.
18
19The special file descriptor is opened via the perf_counter_open()
20system call:
21
22 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
23 pid_t pid, int cpu, int group_fd);
24
25The syscall returns the new fd. The fd can be used via the normal
26VFS system calls: read() can be used to read the counter, fcntl()
27can be used to set the blocking mode, etc.
28
29Multiple counters can be kept open at a time, and the counters
30can be poll()ed.
31
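For illustration, a minimal sketch of waiting for counter data with poll()
(it assumes the fds come from sys_perf_counter_open() as above, and that a
readable counter reports POLLIN, which the text implies but does not spell out):

    #include <poll.h>

    /*
     * Sketch: block until one of two counter fds becomes readable.
     * POLLIN-on-readable is an assumption, not spelled out above.
     */
    static int wait_for_counters(int fd1, int fd2)
    {
            struct pollfd pfds[2] = {
                    { .fd = fd1, .events = POLLIN },
                    { .fd = fd2, .events = POLLIN },
            };

            return poll(pfds, 2, -1);       /* -1: wait indefinitely */
    }
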
32When creating a new counter fd, 'perf_counter_hw_event' is:
33
34/*
35 * Hardware event to monitor via a performance monitoring counter:
36 */
37struct perf_counter_hw_event {
38 s64 type;
39
40 u64 irq_period;
41 u32 record_type;
42
43 u32 disabled : 1, /* off by default */
44 nmi : 1, /* NMI sampling */
45 raw : 1, /* raw event type */
46 __reserved_1 : 29;
47
48 u64 __reserved_2;
49};
50
51/*
52 * Generalized performance counter event types, used by the hw_event.type
53 * parameter of the sys_perf_counter_open() syscall:
54 */
55enum hw_event_types {
56 /*
57 * Common hardware events, generalized by the kernel:
58 */
59 PERF_COUNT_CYCLES = 0,
60 PERF_COUNT_INSTRUCTIONS = 1,
61 PERF_COUNT_CACHE_REFERENCES = 2,
62 PERF_COUNT_CACHE_MISSES = 3,
63 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
64 PERF_COUNT_BRANCH_MISSES = 5,
65
66 /*
67 * Special "software" counters provided by the kernel, even if
68 * the hardware does not support performance counters. These
69 * counters measure various physical and sw events of the
70 * kernel (and allow the profiling of them as well):
71 */
72 PERF_COUNT_CPU_CLOCK = -1,
73 PERF_COUNT_TASK_CLOCK = -2,
74 /*
75 * Future software events:
76 */
77 /* PERF_COUNT_PAGE_FAULTS = -3,
78 PERF_COUNT_CONTEXT_SWITCHES = -4, */
79};
80
81These are standardized types of events that work uniformly on all CPUs
82that implement Performance Counter support under Linux. If a CPU is
83not able to count branch-misses, then the system call will return
84-EINVAL.
85
86More hw_event_types are supported as well, but they are
87CPU-specific and are enumerated via /sys on a per-CPU basis. Raw hw event
88types can be passed in under hw_event.type if hw_event.raw is 1.
89For example, to count "External bus cycles while bus lock signal asserted"
90events on Intel Core CPUs, pass in a 0x4064 event type value and set
91hw_event.raw to 1.
92
93'record_type' is the type of data that a read() will provide for the
94counter, and it can be one of:
95
96/*
97 * IRQ-notification data record type:
98 */
99enum perf_counter_record_type {
100 PERF_RECORD_SIMPLE = 0,
101 PERF_RECORD_IRQ = 1,
102 PERF_RECORD_GROUP = 2,
103};
104
105A "simple" counter is one that counts hardware events and allows
106them to be read out into a u64 count value. (read() returns 8 on
107a successful read of a simple counter.)
108
109An "irq" counter is one that also provides IRQ context information:
110the IP of the interrupted context. In this case read() will return
111the 8-byte counter value, plus the Instruction Pointer address of the
112interrupted context.
113
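A hedged sketch of what such a read() returns, per the description above -
an 8-byte count followed by the instruction pointer (the struct name here is
illustrative, not part of the ABI):

    #include <stdint.h>
    #include <unistd.h>

    /* Illustrative layout only - not a kernel-provided structure. */
    struct irq_sample {
            uint64_t count;     /* current counter value */
            uint64_t ip;        /* IP of the interrupted context */
    };

    /* Returns 0 on a full 16-byte read, -1 otherwise. */
    static int read_irq_counter(int fd, struct irq_sample *s)
    {
            return read(fd, s, sizeof(*s)) == sizeof(*s) ? 0 : -1;
    }
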
114The 'irq_period' parameter is the number of events before waking up
115a read() that is blocked on a counter fd. A value of zero means a
116non-blocking counter.
117
118The 'pid' parameter allows the counter to be specific to a task:
119
120 pid == 0: if the pid parameter is zero, the counter is attached to the
121 current task.
122
123 pid > 0: the counter is attached to a specific task (if the current task
124 has sufficient privilege to do so)
125
126 pid < 0: all tasks are counted (per cpu counters)
127
128The 'cpu' parameter allows a counter to be made specific to a particular
129CPU:
130
131 cpu >= 0: the counter is restricted to a specific CPU
132 cpu == -1: the counter counts on all CPUs
133
134(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
135
136A 'pid > 0' and 'cpu == -1' counter is a per-task counter that counts
137events of that task and 'follows' that task to whatever CPU the task
138gets scheduled to. Per-task counters can be created by any user, for
139their own tasks.
140
141A 'pid == -1' and 'cpu == x' counter is a per-CPU counter that counts
142all events on CPU-x. Per-CPU counters require CAP_SYS_ADMIN privilege.
143
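Putting the pieces above together, a minimal sketch of opening an instruction
counter for the current task across all CPUs and reading it. It assumes a
userspace header exposing struct perf_counter_hw_event, PERF_COUNT_INSTRUCTIONS
and __NR_perf_counter_open; there is no libc wrapper, so syscall() is used
directly:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>     /* assumed to provide the ABI above */

    static int counter_open(struct perf_counter_hw_event *hw_event,
                            pid_t pid, int cpu, int group_fd)
    {
            return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
    }

    int main(void)
    {
            struct perf_counter_hw_event hw_event;
            uint64_t count;
            int fd;

            memset(&hw_event, 0, sizeof(hw_event));
            hw_event.type = PERF_COUNT_INSTRUCTIONS;

            fd = counter_open(&hw_event, 0, -1, -1);  /* current task, all CPUs */
            if (fd < 0)
                    return 1;

            /* ... run the code to be measured ... */

            if (read(fd, &count, sizeof(count)) != sizeof(count))
                    return 1;
            return 0;
    }
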
144Group counters are created by passing in the group_fd of another counter.
145Groups are scheduled onto the PMU as a unit and can be used with PERF_RECORD_GROUP
146to record multi-dimensional timestamps.
147
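And a brief sketch of the group mechanism: the first counter is opened without
a group (group_fd == -1, as in the previous sketch) and becomes the group
leader; subsequent counters pass the leader's fd so that the whole group goes
onto the PMU together. This reuses the hypothetical counter_open() helper
from the previous sketch:

    /* Inside a function; reuses counter_open() from the previous sketch. */
    struct perf_counter_hw_event insns = { .type = PERF_COUNT_INSTRUCTIONS };
    struct perf_counter_hw_event misses = { .type = PERF_COUNT_CACHE_MISSES };
    int leader, member;

    leader = counter_open(&insns, 0, -1, -1);       /* opens with no group */
    member = counter_open(&misses, 0, -1, leader);  /* joins leader's group */
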
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index f75a5fc64d2e..e10f151c3db6 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 */ 131 */
132struct hw_interrupt_type; 132struct hw_interrupt_type;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long get_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(int x)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (x),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152extern void perf_counter_do_pending(void);
153
154#else
155
156static inline unsigned long get_perf_counter_pending(void)
157{
158 return 0;
159}
160
161static inline void set_perf_counter_pending(int x) {}
162static inline void perf_counter_do_pending(void) {}
163#endif /* CONFIG_PERF_COUNTERS */
164
134#endif /* __KERNEL__ */ 165#endif /* __KERNEL__ */
135#endif /* _ASM_POWERPC_HW_IRQ_H */ 166#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 u8 soft_enabled; /* irq soft-enable flag */ 99 u8 soft_enabled; /* irq soft-enable flag */
100 u8 hard_enabled; /* set if irqs are enabled in MSR */ 100 u8 hard_enabled; /* set if irqs are enabled in MSR */
101 u8 io_sync; /* writel() needs spin_unlock sync */ 101 u8 io_sync; /* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 /* Stuff for accurate time accounting */ 104 /* Stuff for accurate time accounting */
104 u64 user_time; /* accumulated usermode TB ticks */ 105 u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..9d7ff6d7fb56
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,72 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15
16/*
17 * This struct provides the constants and functions needed to
18 * describe the PMU on a particular POWER-family CPU.
19 */
20struct power_pmu {
21 int n_counter;
22 int max_alternatives;
23 u64 add_fields;
24 u64 test_adder;
25 int (*compute_mmcr)(unsigned int events[], int n_ev,
26 unsigned int hwc[], u64 mmcr[]);
27 int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
28 int (*get_alternatives)(unsigned int event, unsigned int alt[]);
29 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
30 int n_generic;
31 int *generic_events;
32};
33
34extern struct power_pmu *ppmu;
35
36/*
37 * The power_pmu.get_constraint function returns a 64-bit value and
38 * a 64-bit mask that express the constraints between this event and
39 * other events.
40 *
41 * The value and mask are divided up into (non-overlapping) bitfields
42 * of three different types:
43 *
44 * Select field: this expresses the constraint that some set of bits
45 * in MMCR* needs to be set to a specific value for this event. For a
46 * select field, the mask contains 1s in every bit of the field, and
47 * the value contains a unique value for each possible setting of the
48 * MMCR* bits. The constraint checking code will ensure that two events
49 * that set the same field in their masks have the same value in their
50 * value dwords.
51 *
52 * Add field: this expresses the constraint that there can be at most
53 * N events in a particular class. A field of k bits can be used for
54 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
55 * set (and the other bits 0), and the value has only the least significant
56 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
57 * in the struct power_pmu for this processor come into play. The
58 * add_fields value contains 1 in the LSB of the field, and the
59 * test_adder contains 2^(k-1) - 1 - N in the field.
60 *
61 * NAND field: this expresses the constraint that you may not have events
62 * in all of a set of classes. (For example, on PPC970, you can't select
63 * events from the FPU, ISU and IDU simultaneously, although any two are
64 * possible.) For N classes, the field is N+1 bits wide, and each class
65 * is assigned one bit from the least-significant N bits. The mask has
66 * only the most-significant bit set, and the value has only the bit
67 * for the event's class set. The test_adder has the least significant
68 * bit set in the field.
69 *
70 * If an event is not subject to the constraint expressed by a particular
71 * field, then it will have 0 in both the mask and value for that field.
72 */
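Editorial aside: a minimal sketch of how the 'select field' rule above plays
out (illustrative only - the real check is the mask/value arithmetic in
power_check_constraints() in arch/powerpc/kernel/perf_counter.c below). Two
events are compatible in their select fields when their values agree wherever
their masks overlap:

    /* Sketch: select-field compatibility of two (mask, value) pairs as
     * returned by power_pmu.get_constraint(); assumes kernel u64 types. */
    static inline int select_fields_compatible(u64 mask_a, u64 value_a,
                                               u64 mask_b, u64 value_b)
    {
            u64 overlap = mask_a & mask_b;

            return ((value_a ^ value_b) & overlap) == 0;
    }
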
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 803def236654..da300c4d2888 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL(perf_counter_open)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c76ed77..7cef5afe89d8 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,11 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344 345
345#ifdef __KERNEL__ 346#ifdef __KERNEL__
346 347
347#define __NR_syscalls 319 348#define __NR_syscalls 320
348 349
349#define __NR__exit __NR_exit 350#define __NR__exit __NR_exit
350#define NR_syscalls __NR_syscalls 351#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8d1a419df35d..7c941ec3b23e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o
94obj64-$(CONFIG_AUDIT) += compat_audit.o 94obj64-$(CONFIG_AUDIT) += compat_audit.o
95 95
96obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 96obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power6-pmu.o
97 98
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 99obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 100
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 9937fe44555f..ce3f8f12f731 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -127,6 +127,7 @@ int main(void)
127 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 127 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
128 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 128 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
129 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 129 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
130 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
130 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 131 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
131 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 132 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
132 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 133 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 383ed6eb0085..f30b4e553c53 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ad1e5ac721d8..7f8e6a92c5a1 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable)
104 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); 104 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
105} 105}
106 106
107#ifdef CONFIG_PERF_COUNTERS
108notrace void __weak perf_counter_do_pending(void)
109{
110 set_perf_counter_pending(0);
111}
112#endif
113
107notrace void raw_local_irq_restore(unsigned long en) 114notrace void raw_local_irq_restore(unsigned long en)
108{ 115{
109 /* 116 /*
@@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 142 iseries_handle_interrupts();
136 } 143 }
137 144
145 if (get_perf_counter_pending())
146 perf_counter_do_pending();
147
138 /* 148 /*
139 * if (get_paca()->hard_enabled) return; 149 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 150 * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..5b0211348c73
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,785 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19
20struct cpu_hw_counters {
21 int n_counters;
22 int n_percpu;
23 int disabled;
24 int n_added;
25 struct perf_counter *counter[MAX_HWCOUNTERS];
26 unsigned int events[MAX_HWCOUNTERS];
27 u64 mmcr[3];
28 u8 pmcs_enabled;
29};
30DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
31
32struct power_pmu *ppmu;
33
34void perf_counter_print_debug(void)
35{
36}
37
38/*
39 * Read one performance monitor counter (PMC).
40 */
41static unsigned long read_pmc(int idx)
42{
43 unsigned long val;
44
45 switch (idx) {
46 case 1:
47 val = mfspr(SPRN_PMC1);
48 break;
49 case 2:
50 val = mfspr(SPRN_PMC2);
51 break;
52 case 3:
53 val = mfspr(SPRN_PMC3);
54 break;
55 case 4:
56 val = mfspr(SPRN_PMC4);
57 break;
58 case 5:
59 val = mfspr(SPRN_PMC5);
60 break;
61 case 6:
62 val = mfspr(SPRN_PMC6);
63 break;
64 case 7:
65 val = mfspr(SPRN_PMC7);
66 break;
67 case 8:
68 val = mfspr(SPRN_PMC8);
69 break;
70 default:
71 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
72 val = 0;
73 }
74 return val;
75}
76
77/*
78 * Write one PMC.
79 */
80static void write_pmc(int idx, unsigned long val)
81{
82 switch (idx) {
83 case 1:
84 mtspr(SPRN_PMC1, val);
85 break;
86 case 2:
87 mtspr(SPRN_PMC2, val);
88 break;
89 case 3:
90 mtspr(SPRN_PMC3, val);
91 break;
92 case 4:
93 mtspr(SPRN_PMC4, val);
94 break;
95 case 5:
96 mtspr(SPRN_PMC5, val);
97 break;
98 case 6:
99 mtspr(SPRN_PMC6, val);
100 break;
101 case 7:
102 mtspr(SPRN_PMC7, val);
103 break;
104 case 8:
105 mtspr(SPRN_PMC8, val);
106 break;
107 default:
108 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
109 }
110}
111
112/*
113 * Check if a set of events can all go on the PMU at once.
114 * If they can't, this will look at alternative codes for the events
115 * and see if any combination of alternative codes is feasible.
116 * The feasible set is returned in event[].
117 */
118static int power_check_constraints(unsigned int event[], int n_ev)
119{
120 u64 mask, value, nv;
121 unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
122 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
123 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
124 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
125 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
126 int i, j;
127 u64 addf = ppmu->add_fields;
128 u64 tadd = ppmu->test_adder;
129
130 if (n_ev > ppmu->n_counter)
131 return -1;
132
133 /* First see if the events will go on as-is */
134 for (i = 0; i < n_ev; ++i) {
135 alternatives[i][0] = event[i];
136 if (ppmu->get_constraint(event[i], &amasks[i][0],
137 &avalues[i][0]))
138 return -1;
139 choice[i] = 0;
140 }
141 value = mask = 0;
142 for (i = 0; i < n_ev; ++i) {
143 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
144 if ((((nv + tadd) ^ value) & mask) != 0 ||
145 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
146 break;
147 value = nv;
148 mask |= amasks[i][0];
149 }
150 if (i == n_ev)
151 return 0; /* all OK */
152
153 /* doesn't work, gather alternatives... */
154 if (!ppmu->get_alternatives)
155 return -1;
156 for (i = 0; i < n_ev; ++i) {
157 n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
158 for (j = 1; j < n_alt[i]; ++j)
159 ppmu->get_constraint(alternatives[i][j],
160 &amasks[i][j], &avalues[i][j]);
161 }
162
163 /* enumerate all possibilities and see if any will work */
164 i = 0;
165 j = -1;
166 value = mask = nv = 0;
167 while (i < n_ev) {
168 if (j >= 0) {
169 /* we're backtracking, restore context */
170 value = svalues[i];
171 mask = smasks[i];
172 j = choice[i];
173 }
174 /*
175 * See if any alternative k for event i,
176 * where k > j, will satisfy the constraints.
177 */
178 while (++j < n_alt[i]) {
179 nv = (value | avalues[i][j]) +
180 (value & avalues[i][j] & addf);
181 if ((((nv + tadd) ^ value) & mask) == 0 &&
182 (((nv + tadd) ^ avalues[i][j])
183 & amasks[i][j]) == 0)
184 break;
185 }
186 if (j >= n_alt[i]) {
187 /*
188 * No feasible alternative, backtrack
189 * to event i-1 and continue enumerating its
190 * alternatives from where we got up to.
191 */
192 if (--i < 0)
193 return -1;
194 } else {
195 /*
196 * Found a feasible alternative for event i,
197 * remember where we got up to with this event,
198 * go on to the next event, and start with
199 * the first alternative for it.
200 */
201 choice[i] = j;
202 svalues[i] = value;
203 smasks[i] = mask;
204 value = nv;
205 mask |= amasks[i][j];
206 ++i;
207 j = -1;
208 }
209 }
210
211 /* OK, we have a feasible combination, tell the caller the solution */
212 for (i = 0; i < n_ev; ++i)
213 event[i] = alternatives[i][choice[i]];
214 return 0;
215}
216
217static void power_perf_read(struct perf_counter *counter)
218{
219 long val, delta, prev;
220
221 if (!counter->hw.idx)
222 return;
223 /*
224 * Performance monitor interrupts come even when interrupts
225 * are soft-disabled, as long as interrupts are hard-enabled.
226 * Therefore we treat them like NMIs.
227 */
228 do {
229 prev = atomic64_read(&counter->hw.prev_count);
230 barrier();
231 val = read_pmc(counter->hw.idx);
232 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
233
234 /* The counters are only 32 bits wide */
235 delta = (val - prev) & 0xfffffffful;
236 atomic64_add(delta, &counter->count);
237 atomic64_sub(delta, &counter->hw.period_left);
238}
239
240/*
241 * Disable all counters to prevent PMU interrupts and to allow
242 * counters to be added or removed.
243 */
244u64 hw_perf_save_disable(void)
245{
246 struct cpu_hw_counters *cpuhw;
247 unsigned long ret;
248 unsigned long flags;
249
250 local_irq_save(flags);
251 cpuhw = &__get_cpu_var(cpu_hw_counters);
252
253 ret = cpuhw->disabled;
254 if (!ret) {
255 cpuhw->disabled = 1;
256 cpuhw->n_added = 0;
257
258 /*
259 * Check if we ever enabled the PMU on this cpu.
260 */
261 if (!cpuhw->pmcs_enabled) {
262 if (ppc_md.enable_pmcs)
263 ppc_md.enable_pmcs();
264 cpuhw->pmcs_enabled = 1;
265 }
266
267 /*
268 * Set the 'freeze counters' bit.
269 * The barrier is to make sure the mtspr has been
270 * executed and the PMU has frozen the counters
271 * before we return.
272 */
273 mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
274 mb();
275 }
276 local_irq_restore(flags);
277 return ret;
278}
279
280/*
281 * Re-enable all counters if disable == 0.
282 * If we were previously disabled and counters were added, then
283 * put the new config on the PMU.
284 */
285void hw_perf_restore(u64 disable)
286{
287 struct perf_counter *counter;
288 struct cpu_hw_counters *cpuhw;
289 unsigned long flags;
290 long i;
291 unsigned long val;
292 s64 left;
293 unsigned int hwc_index[MAX_HWCOUNTERS];
294
295 if (disable)
296 return;
297 local_irq_save(flags);
298 cpuhw = &__get_cpu_var(cpu_hw_counters);
299 cpuhw->disabled = 0;
300
301 /*
302 * If we didn't change anything, or only removed counters,
303 * no need to recalculate MMCR* settings and reset the PMCs.
304 * Just reenable the PMU with the current MMCR* settings
305 * (possibly updated for removal of counters).
306 */
307 if (!cpuhw->n_added) {
308 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
309 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
310 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
311 if (cpuhw->n_counters == 0)
312 get_lppaca()->pmcregs_in_use = 0;
313 goto out;
314 }
315
316 /*
317 * Compute MMCR* values for the new set of counters
318 */
319 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
320 cpuhw->mmcr)) {
321 /* shouldn't ever get here */
322 printk(KERN_ERR "oops compute_mmcr failed\n");
323 goto out;
324 }
325
326 /*
327 * Write the new configuration to MMCR* with the freeze
328 * bit set and set the hardware counters to their initial values.
329 * Then unfreeze the counters.
330 */
331 get_lppaca()->pmcregs_in_use = 1;
332 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
333 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
334 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
335 | MMCR0_FC);
336
337 /*
338 * Read off any pre-existing counters that need to move
339 * to another PMC.
340 */
341 for (i = 0; i < cpuhw->n_counters; ++i) {
342 counter = cpuhw->counter[i];
343 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
344 power_perf_read(counter);
345 write_pmc(counter->hw.idx, 0);
346 counter->hw.idx = 0;
347 }
348 }
349
350 /*
351 * Initialize the PMCs for all the new and moved counters.
352 */
353 for (i = 0; i < cpuhw->n_counters; ++i) {
354 counter = cpuhw->counter[i];
355 if (counter->hw.idx)
356 continue;
357 val = 0;
358 if (counter->hw_event.irq_period) {
359 left = atomic64_read(&counter->hw.period_left);
360 if (left < 0x80000000L)
361 val = 0x80000000L - left;
362 }
363 atomic64_set(&counter->hw.prev_count, val);
364 counter->hw.idx = hwc_index[i] + 1;
365 write_pmc(counter->hw.idx, val);
366 }
367 mb();
368 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
369 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
370
371 out:
372 local_irq_restore(flags);
373}
374
375static int collect_events(struct perf_counter *group, int max_count,
376 struct perf_counter *ctrs[], unsigned int *events)
377{
378 int n = 0;
379 struct perf_counter *counter;
380
381 if (!is_software_counter(group)) {
382 if (n >= max_count)
383 return -1;
384 ctrs[n] = group;
385 events[n++] = group->hw.config;
386 }
387 list_for_each_entry(counter, &group->sibling_list, list_entry) {
388 if (!is_software_counter(counter) &&
389 counter->state != PERF_COUNTER_STATE_OFF) {
390 if (n >= max_count)
391 return -1;
392 ctrs[n] = counter;
393 events[n++] = counter->hw.config;
394 }
395 }
396 return n;
397}
398
399static void counter_sched_in(struct perf_counter *counter, int cpu)
400{
401 counter->state = PERF_COUNTER_STATE_ACTIVE;
402 counter->oncpu = cpu;
403 if (is_software_counter(counter))
404 counter->hw_ops->enable(counter);
405}
406
407/*
408 * Called to enable a whole group of counters.
409 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
410 * Assumes the caller has disabled interrupts and has
411 * frozen the PMU with hw_perf_save_disable.
412 */
413int hw_perf_group_sched_in(struct perf_counter *group_leader,
414 struct perf_cpu_context *cpuctx,
415 struct perf_counter_context *ctx, int cpu)
416{
417 struct cpu_hw_counters *cpuhw;
418 long i, n, n0;
419 struct perf_counter *sub;
420
421 cpuhw = &__get_cpu_var(cpu_hw_counters);
422 n0 = cpuhw->n_counters;
423 n = collect_events(group_leader, ppmu->n_counter - n0,
424 &cpuhw->counter[n0], &cpuhw->events[n0]);
425 if (n < 0)
426 return -EAGAIN;
427 if (power_check_constraints(cpuhw->events, n + n0))
428 return -EAGAIN;
429 cpuhw->n_counters = n0 + n;
430 cpuhw->n_added += n;
431
432 /*
433 * OK, this group can go on; update counter states etc.,
434 * and enable any software counters
435 */
436 for (i = n0; i < n0 + n; ++i)
437 cpuhw->counter[i]->hw.config = cpuhw->events[i];
438 cpuctx->active_oncpu += n;
439 n = 1;
440 counter_sched_in(group_leader, cpu);
441 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
442 if (sub->state != PERF_COUNTER_STATE_OFF) {
443 counter_sched_in(sub, cpu);
444 ++n;
445 }
446 }
447 ctx->nr_active += n;
448
449 return 1;
450}
451
452/*
453 * Add a counter to the PMU.
454 * If all counters are not already frozen, then we disable and
455 * re-enable the PMU in order to get hw_perf_restore to do the
456 * actual work of reconfiguring the PMU.
457 */
458static int power_perf_enable(struct perf_counter *counter)
459{
460 struct cpu_hw_counters *cpuhw;
461 unsigned long flags;
462 u64 pmudis;
463 int n0;
464 int ret = -EAGAIN;
465
466 local_irq_save(flags);
467 pmudis = hw_perf_save_disable();
468
469 /*
470 * Add the counter to the list (if there is room)
471 * and check whether the total set is still feasible.
472 */
473 cpuhw = &__get_cpu_var(cpu_hw_counters);
474 n0 = cpuhw->n_counters;
475 if (n0 >= ppmu->n_counter)
476 goto out;
477 cpuhw->counter[n0] = counter;
478 cpuhw->events[n0] = counter->hw.config;
479 if (power_check_constraints(cpuhw->events, n0 + 1))
480 goto out;
481
482 counter->hw.config = cpuhw->events[n0];
483 ++cpuhw->n_counters;
484 ++cpuhw->n_added;
485
486 ret = 0;
487 out:
488 hw_perf_restore(pmudis);
489 local_irq_restore(flags);
490 return ret;
491}
492
493/*
494 * Remove a counter from the PMU.
495 */
496static void power_perf_disable(struct perf_counter *counter)
497{
498 struct cpu_hw_counters *cpuhw;
499 long i;
500 u64 pmudis;
501 unsigned long flags;
502
503 local_irq_save(flags);
504 pmudis = hw_perf_save_disable();
505
506 power_perf_read(counter);
507
508 cpuhw = &__get_cpu_var(cpu_hw_counters);
509 for (i = 0; i < cpuhw->n_counters; ++i) {
510 if (counter == cpuhw->counter[i]) {
511 while (++i < cpuhw->n_counters)
512 cpuhw->counter[i-1] = cpuhw->counter[i];
513 --cpuhw->n_counters;
514 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
515 write_pmc(counter->hw.idx, 0);
516 counter->hw.idx = 0;
517 break;
518 }
519 }
520 if (cpuhw->n_counters == 0) {
521 /* disable exceptions if no counters are running */
522 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
523 }
524
525 hw_perf_restore(pmudis);
526 local_irq_restore(flags);
527}
528
529struct hw_perf_counter_ops power_perf_ops = {
530 .enable = power_perf_enable,
531 .disable = power_perf_disable,
532 .read = power_perf_read
533};
534
535const struct hw_perf_counter_ops *
536hw_perf_counter_init(struct perf_counter *counter)
537{
538 unsigned long ev;
539 struct perf_counter *ctrs[MAX_HWCOUNTERS];
540 unsigned int events[MAX_HWCOUNTERS];
541 int n;
542
543 if (!ppmu)
544 return NULL;
545 if ((s64)counter->hw_event.irq_period < 0)
546 return NULL;
547 ev = counter->hw_event.type;
548 if (!counter->hw_event.raw) {
549 if (ev >= ppmu->n_generic ||
550 ppmu->generic_events[ev] == 0)
551 return NULL;
552 ev = ppmu->generic_events[ev];
553 }
554 counter->hw.config_base = ev;
555 counter->hw.idx = 0;
556
557 /*
558 * If this is in a group, check if it can go on with all the
559 * other hardware counters in the group. We assume the counter
560 * hasn't been linked into its leader's sibling list at this point.
561 */
562 n = 0;
563 if (counter->group_leader != counter) {
564 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
565 ctrs, events);
566 if (n < 0)
567 return NULL;
568 }
569 events[n++] = ev;
570 if (power_check_constraints(events, n))
571 return NULL;
572
573 counter->hw.config = events[n - 1];
574 atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
575 return &power_perf_ops;
576}
577
578/*
579 * Handle wakeups.
580 */
581void perf_counter_do_pending(void)
582{
583 int i;
584 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
585 struct perf_counter *counter;
586
587 set_perf_counter_pending(0);
588 for (i = 0; i < cpuhw->n_counters; ++i) {
589 counter = cpuhw->counter[i];
590 if (counter && counter->wakeup_pending) {
591 counter->wakeup_pending = 0;
592 wake_up(&counter->waitq);
593 }
594 }
595}
596
597/*
598 * Record data for an irq counter.
599 * This function was lifted from the x86 code; maybe it should
600 * go in the core?
601 */
602static void perf_store_irq_data(struct perf_counter *counter, u64 data)
603{
604 struct perf_data *irqdata = counter->irqdata;
605
606 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
607 irqdata->overrun++;
608 } else {
609 u64 *p = (u64 *) &irqdata->data[irqdata->len];
610
611 *p = data;
612 irqdata->len += sizeof(u64);
613 }
614}
615
616/*
617 * Record all the values of the counters in a group
618 */
619static void perf_handle_group(struct perf_counter *counter)
620{
621 struct perf_counter *leader, *sub;
622
623 leader = counter->group_leader;
624 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
625 if (sub != counter)
626 sub->hw_ops->read(sub);
627 perf_store_irq_data(counter, sub->hw_event.type);
628 perf_store_irq_data(counter, atomic64_read(&sub->count));
629 }
630}
631
632/*
633 * A counter has overflowed; update its count and record
634 * things if requested. Note that interrupts are hard-disabled
635 * here so there is no possibility of being interrupted.
636 */
637static void record_and_restart(struct perf_counter *counter, long val,
638 struct pt_regs *regs)
639{
640 s64 prev, delta, left;
641 int record = 0;
642
643 /* we don't have to worry about interrupts here */
644 prev = atomic64_read(&counter->hw.prev_count);
645 delta = (val - prev) & 0xfffffffful;
646 atomic64_add(delta, &counter->count);
647
648 /*
649 * See if the total period for this counter has expired,
650 * and update for the next period.
651 */
652 val = 0;
653 left = atomic64_read(&counter->hw.period_left) - delta;
654 if (counter->hw_event.irq_period) {
655 if (left <= 0) {
656 left += counter->hw_event.irq_period;
657 if (left <= 0)
658 left = counter->hw_event.irq_period;
659 record = 1;
660 }
661 if (left < 0x80000000L)
662 val = 0x80000000L - left;
663 }
664 write_pmc(counter->hw.idx, val);
665 atomic64_set(&counter->hw.prev_count, val);
666 atomic64_set(&counter->hw.period_left, left);
667
668 /*
669 * Finally record data if requested.
670 */
671 if (record) {
672 switch (counter->hw_event.record_type) {
673 case PERF_RECORD_SIMPLE:
674 break;
675 case PERF_RECORD_IRQ:
676 perf_store_irq_data(counter, instruction_pointer(regs));
677 counter->wakeup_pending = 1;
678 break;
679 case PERF_RECORD_GROUP:
680 perf_handle_group(counter);
681 counter->wakeup_pending = 1;
682 break;
683 }
684 }
685}
686
687/*
688 * Performance monitor interrupt stuff
689 */
690static void perf_counter_interrupt(struct pt_regs *regs)
691{
692 int i;
693 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
694 struct perf_counter *counter;
695 long val;
696 int need_wakeup = 0, found = 0;
697
698 for (i = 0; i < cpuhw->n_counters; ++i) {
699 counter = cpuhw->counter[i];
700 val = read_pmc(counter->hw.idx);
701 if ((int)val < 0) {
702 /* counter has overflowed */
703 found = 1;
704 record_and_restart(counter, val, regs);
705 if (counter->wakeup_pending)
706 need_wakeup = 1;
707 }
708 }
709
710 /*
711 * In case we didn't find and reset the counter that caused
712 * the interrupt, scan all counters and reset any that are
713 * negative, to avoid getting continual interrupts.
714 * Any that we processed in the previous loop will not be negative.
715 */
716 if (!found) {
717 for (i = 0; i < ppmu->n_counter; ++i) {
718 val = read_pmc(i + 1);
719 if ((int)val < 0)
720 write_pmc(i + 1, 0);
721 }
722 }
723
724 /*
725 * Reset MMCR0 to its normal value. This will set PMXE and
726 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
727 * and thus allow interrupts to occur again.
728 * XXX might want to use MSR.PM to keep the counters frozen until
729 * we get back out of this interrupt.
730 */
731 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
732
733 /*
734 * If we need a wakeup, check whether interrupts were soft-enabled
735 * when we took the interrupt. If they were, we can wake stuff up
736 * immediately; otherwise we'll have to set a flag and do the
737 * wakeup when interrupts get soft-enabled.
738 */
739 if (need_wakeup) {
740 if (regs->softe) {
741 irq_enter();
742 perf_counter_do_pending();
743 irq_exit();
744 } else {
745 set_perf_counter_pending(1);
746 }
747 }
748}
749
750void hw_perf_counter_setup(int cpu)
751{
752 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
753
754 memset(cpuhw, 0, sizeof(*cpuhw));
755 cpuhw->mmcr[0] = MMCR0_FC;
756}
757
758extern struct power_pmu ppc970_pmu;
759extern struct power_pmu power6_pmu;
760
761static int init_perf_counters(void)
762{
763 unsigned long pvr;
764
765 if (reserve_pmc_hardware(perf_counter_interrupt)) {
766 printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
767 return -EBUSY;
768 }
769
770 /* XXX should get this from cputable */
771 pvr = mfspr(SPRN_PVR);
772 switch (PVR_VER(pvr)) {
773 case PV_970:
774 case PV_970FX:
775 case PV_970MP:
776 ppmu = &ppc970_pmu;
777 break;
778 case 0x3e:
779 ppmu = &power6_pmu;
780 break;
781 }
782 return 0;
783}
784
785arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..b1f61f3c97bb
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,283 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER6
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0x7
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* Unit event comes from (TTMxSEL encoding) */
22#define PM_UNIT_MSK 0xf
23#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
24#define PM_LLAV 0x8000 /* Load lookahead match value */
25#define PM_LLA 0x4000 /* Load lookahead match enable */
26#define PM_BYTE_SH 12 /* Byte of event bus to use */
27#define PM_BYTE_MSK 3
28#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
29#define PM_SUBUNIT_MSK 7
30#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
31#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
32#define PM_BUSEVENT_MSK 0xf3700
33
34/*
35 * Bits in MMCR1 for POWER6
36 */
37#define MMCR1_TTM0SEL_SH 60
38#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
39#define MMCR1_TTMSEL_MSK 0xf
40#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
41#define MMCR1_NESTSEL_SH 45
42#define MMCR1_NESTSEL_MSK 0x7
43#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
44#define MMCR1_PMC1_LLA ((u64)1 << 44)
45#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
46#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
47#define MMCR1_PMC1SEL_SH 24
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Assign PMC numbers and compute MMCR1 value for a set of events
53 */
54static int p6_compute_mmcr(unsigned int event[], int n_ev,
55 unsigned int hwc[], u64 mmcr[])
56{
57 u64 mmcr1 = 0;
58 int i;
59 unsigned int pmc, ev, b, u, s, psel;
60 unsigned int ttmset = 0;
61 unsigned int pmc_inuse = 0;
62
63 if (n_ev > 4)
64 return -1;
65 for (i = 0; i < n_ev; ++i) {
66 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
67 if (pmc) {
68 if (pmc_inuse & (1 << (pmc - 1)))
69 return -1; /* collision! */
70 pmc_inuse |= 1 << (pmc - 1);
71 }
72 }
73 for (i = 0; i < n_ev; ++i) {
74 ev = event[i];
75 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
76 if (pmc) {
77 --pmc;
78 } else {
79 /* can go on any PMC; find a free one */
80 for (pmc = 0; pmc < 4; ++pmc)
81 if (!(pmc_inuse & (1 << pmc)))
82 break;
83 pmc_inuse |= 1 << pmc;
84 }
85 hwc[i] = pmc;
86 psel = ev & PM_PMCSEL_MSK;
87 if (ev & PM_BUSEVENT_MSK) {
88 /* this event uses the event bus */
89 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
90 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
91 /* check for conflict on this byte of event bus */
92 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
93 return -1;
94 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
95 ttmset |= 1 << b;
96 if (u == 5) {
97 /* Nest events have a further mux */
98 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
99 if ((ttmset & 0x10) &&
100 MMCR1_NESTSEL(mmcr1) != s)
101 return -1;
102 ttmset |= 0x10;
103 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
104 }
105 if (0x30 <= psel && psel <= 0x3d) {
106 /* these need the PMCx_ADDR_SEL bits */
107 if (b >= 2)
108 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
109 }
110 /* bus select values are different for PMC3/4 */
111 if (pmc >= 2 && (psel & 0x90) == 0x80)
112 psel ^= 0x20;
113 }
114 if (ev & PM_LLA) {
115 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
116 if (ev & PM_LLAV)
117 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
118 }
119 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
120 }
121 mmcr[0] = 0;
122 if (pmc_inuse & 1)
123 mmcr[0] = MMCR0_PMC1CE;
124 if (pmc_inuse & 0xe)
125 mmcr[0] |= MMCR0_PMCjCE;
126 mmcr[1] = mmcr1;
127 mmcr[2] = 0;
128 return 0;
129}
130
131/*
132 * Layout of constraint bits:
133 *
134 * 0-1 add field: number of uses of PMC1 (max 1)
135 * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4
136 * 8-10 select field: nest (subunit) event selector
137 * 16-19 select field: unit on byte 0 of event bus
138 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
139 */
140static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
141{
142 int pmc, byte, sh;
143 unsigned int mask = 0, value = 0;
144
145 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
146 if (pmc) {
147 if (pmc > 4)
148 return -1;
149 sh = (pmc - 1) * 2;
150 mask |= 2 << sh;
151 value |= 1 << sh;
152 }
153 if (event & PM_BUSEVENT_MSK) {
154 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
155 sh = byte * 4;
156 mask |= PM_UNIT_MSKS << sh;
157 value |= (event & PM_UNIT_MSKS) << sh;
158 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
159 mask |= PM_SUBUNIT_MSKS;
160 value |= event & PM_SUBUNIT_MSKS;
161 }
162 }
163 *maskp = mask;
164 *valp = value;
165 return 0;
166}
167
168#define MAX_ALT 4 /* at most 4 alternatives for any event */
169
170static const unsigned int event_alternatives[][MAX_ALT] = {
171 { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
172 { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
173 { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
174 { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */
175 { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
176 { 0x10000e, 0x400010 }, /* PM_PURR */
177 { 0x100010, 0x4000f8 }, /* PM_FLUSH */
178 { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
179 { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
180 { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
181 { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
182 { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
183 { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
184 { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
185 { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
186 { 0x200012, 0x300012 }, /* PM_INST_DISP */
187 { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
188 { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
189 { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
190 { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
191 { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
192 { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
193 { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
194};
195
196/*
197 * This could be made more efficient with a binary search on
198 * a presorted list, if necessary
199 */
200static int find_alternatives_list(unsigned int event)
201{
202 int i, j;
203 unsigned int alt;
204
205 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
206 if (event < event_alternatives[i][0])
207 return -1;
208 for (j = 0; j < MAX_ALT; ++j) {
209 alt = event_alternatives[i][j];
210 if (!alt || event < alt)
211 break;
212 if (event == alt)
213 return i;
214 }
215 }
216 return -1;
217}
218
219static int p6_get_alternatives(unsigned int event, unsigned int alt[])
220{
221 int i, j;
222 unsigned int aevent, psel, pmc;
223 unsigned int nalt = 1;
224
225 alt[0] = event;
226
227 /* check the alternatives table */
228 i = find_alternatives_list(event);
229 if (i >= 0) {
230 /* copy out alternatives from list */
231 for (j = 0; j < MAX_ALT; ++j) {
232 aevent = event_alternatives[i][j];
233 if (!aevent)
234 break;
235 if (aevent != event)
236 alt[nalt++] = aevent;
237 }
238
239 } else {
240 /* Check for alternative ways of computing sum events */
241 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
242 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
243 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
244 if (pmc && (psel == 0x32 || psel == 0x34))
245 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
246 ((5 - pmc) << PM_PMC_SH);
247
248 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
249 if (pmc && (psel == 0x38 || psel == 0x3a))
250 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
251 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
252 }
253
254 return nalt;
255}
256
257static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
258{
259 /* Set PMCxSEL to 0 to disable PMCx */
260 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
261}
262
263static int power6_generic_events[] = {
264 [PERF_COUNT_CPU_CYCLES] = 0x1e,
265 [PERF_COUNT_INSTRUCTIONS] = 2,
266 [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
267 [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
268 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
269 [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
270};
271
272struct power_pmu power6_pmu = {
273 .n_counter = 4,
274 .max_alternatives = MAX_ALT,
275 .add_fields = 0x55,
276 .test_adder = 0,
277 .compute_mmcr = p6_compute_mmcr,
278 .get_constraint = p6_get_constraint,
279 .get_alternatives = p6_get_alternatives,
280 .disable_pmc = p6_disable_pmc,
281 .n_generic = ARRAY_SIZE(power6_generic_events),
282 .generic_events = power6_generic_events,
283};
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..c3256580be1a
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,375 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_BYTE_SH 4 /* Byte number of event bus to use */
23#define PM_BYTE_MSK 3
24#define PM_PMCSEL_MSK 0xf
25
26/* Values in PM_UNIT field */
27#define PM_NONE 0
28#define PM_FPU 1
29#define PM_VPU 2
30#define PM_ISU 3
31#define PM_IFU 4
32#define PM_IDU 5
33#define PM_STS 6
34#define PM_LSU0 7
35#define PM_LSU1U 8
36#define PM_LSU1L 9
37#define PM_LASTUNIT 9
38
39/*
40 * Bits in MMCR0 for PPC970
41 */
42#define MMCR0_PMC1SEL_SH 8
43#define MMCR0_PMC2SEL_SH 1
44#define MMCR_PMCSEL_MSK 0x1f
45
46/*
47 * Bits in MMCR1 for PPC970
48 */
49#define MMCR1_TTM0SEL_SH 62
50#define MMCR1_TTM1SEL_SH 59
51#define MMCR1_TTM3SEL_SH 53
52#define MMCR1_TTMSEL_MSK 3
53#define MMCR1_TD_CP_DBG0SEL_SH 50
54#define MMCR1_TD_CP_DBG1SEL_SH 48
55#define MMCR1_TD_CP_DBG2SEL_SH 46
56#define MMCR1_TD_CP_DBG3SEL_SH 44
57#define MMCR1_PMC1_ADDER_SEL_SH 39
58#define MMCR1_PMC2_ADDER_SEL_SH 38
59#define MMCR1_PMC6_ADDER_SEL_SH 37
60#define MMCR1_PMC5_ADDER_SEL_SH 36
61#define MMCR1_PMC8_ADDER_SEL_SH 35
62#define MMCR1_PMC7_ADDER_SEL_SH 34
63#define MMCR1_PMC3_ADDER_SEL_SH 33
64#define MMCR1_PMC4_ADDER_SEL_SH 32
65#define MMCR1_PMC3SEL_SH 27
66#define MMCR1_PMC4SEL_SH 22
67#define MMCR1_PMC5SEL_SH 17
68#define MMCR1_PMC6SEL_SH 12
69#define MMCR1_PMC7SEL_SH 7
70#define MMCR1_PMC8SEL_SH 2
71
72static short mmcr1_adder_bits[8] = {
73 MMCR1_PMC1_ADDER_SEL_SH,
74 MMCR1_PMC2_ADDER_SEL_SH,
75 MMCR1_PMC3_ADDER_SEL_SH,
76 MMCR1_PMC4_ADDER_SEL_SH,
77 MMCR1_PMC5_ADDER_SEL_SH,
78 MMCR1_PMC6_ADDER_SEL_SH,
79 MMCR1_PMC7_ADDER_SEL_SH,
80 MMCR1_PMC8_ADDER_SEL_SH
81};
82
83/*
84 * Bits in MMCRA
85 */
86
87/*
88 * Layout of constraint bits:
89 * 6666555555555544444444443333333333222222222211111111110000000000
90 * 3210987654321098765432109876543210987654321098765432109876543210
91 * <><>[ >[ >[ >< >< >< >< ><><><><><><><><>
92 * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
93 *
94 * T0 - TTM0 constraint
95 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
96 *
97 * T1 - TTM1 constraint
98 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
99 *
100 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
101 * 43: UC3 error 0x0800_0000_0000
102 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
103 * 41: ISU events needed 0x0200_0000_0000
104 * 40: IDU|STS events needed 0x0100_0000_0000
105 *
106 * PS1
107 * 39: PS1 error 0x0080_0000_0000
108 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
109 *
110 * PS2
111 * 35: PS2 error 0x0008_0000_0000
112 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
113 *
114 * B0
115 * 28-31: Byte 0 event source 0xf000_0000
116 * Encoding as for the event code
117 *
118 * B1, B2, B3
119 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
120 *
121 * P1
122 * 15: P1 error 0x8000
123 * 14-15: Count of events needing PMC1
124 *
125 * P2..P8
126 * 0-13: Count of events needing PMC2..PMC8
127 */
128
129/* Masks and values for using events from the various units */
130static u64 unit_cons[PM_LASTUNIT+1][2] = {
131 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
132 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
133 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
134 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
135 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
136 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
137};
138
139static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
140{
141 int pmc, byte, unit, sh;
142 u64 mask = 0, value = 0;
143 int grp = -1;
144
145 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
146 if (pmc) {
147 if (pmc > 8)
148 return -1;
149 sh = (pmc - 1) * 2;
150 mask |= 2 << sh;
151 value |= 1 << sh;
152 grp = ((pmc - 1) >> 1) & 1;
153 }
154 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
155 if (unit) {
156 if (unit > PM_LASTUNIT)
157 return -1;
158 mask |= unit_cons[unit][0];
159 value |= unit_cons[unit][1];
160 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
161 /*
162 * Bus events on bytes 0 and 2 can be counted
163 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
164 */
165 if (!pmc)
166 grp = byte & 1;
167 /* Set byte lane select field */
168 mask |= 0xfULL << (28 - 4 * byte);
169 value |= (u64)unit << (28 - 4 * byte);
170 }
171 if (grp == 0) {
172 /* increment PMC1/2/5/6 field */
173 mask |= 0x8000000000ull;
174 value |= 0x1000000000ull;
175 } else if (grp == 1) {
176 /* increment PMC3/4/7/8 field */
177 mask |= 0x800000000ull;
178 value |= 0x100000000ull;
179 }
180 *maskp = mask;
181 *valp = value;
182 return 0;
183}
184
185static int p970_get_alternatives(unsigned int event, unsigned int alt[])
186{
187 alt[0] = event;
188
189 /* 2 alternatives for LSU empty */
190 if (event == 0x2002 || event == 0x3002) {
191 alt[1] = event ^ 0x1000;
192 return 2;
193 }
194
195 return 1;
196}
197
198static int p970_compute_mmcr(unsigned int event[], int n_ev,
199 unsigned int hwc[], u64 mmcr[])
200{
201 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
202 unsigned int pmc, unit, byte, psel;
203 unsigned int ttm, grp;
204 unsigned int pmc_inuse = 0;
205 unsigned int pmc_grp_use[2];
206 unsigned char busbyte[4];
207 unsigned char unituse[16];
208 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
209 unsigned char ttmuse[2];
210 unsigned char pmcsel[8];
211 int i;
212
213 if (n_ev > 8)
214 return -1;
215
216 /* First pass to count resource use */
217 pmc_grp_use[0] = pmc_grp_use[1] = 0;
218 memset(busbyte, 0, sizeof(busbyte));
219 memset(unituse, 0, sizeof(unituse));
220 for (i = 0; i < n_ev; ++i) {
221 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
222 if (pmc) {
223 if (pmc_inuse & (1 << (pmc - 1)))
224 return -1;
225 pmc_inuse |= 1 << (pmc - 1);
226 /* count 1/2/5/6 vs 3/4/7/8 use */
227 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
228 }
229 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
230 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
231 if (unit) {
232 if (unit > PM_LASTUNIT)
233 return -1;
234 if (!pmc)
235 ++pmc_grp_use[byte & 1];
236 if (busbyte[byte] && busbyte[byte] != unit)
237 return -1;
238 busbyte[byte] = unit;
239 unituse[unit] = 1;
240 }
241 }
242 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
243 return -1;
244
245 /*
246 * Assign resources and set multiplexer selects.
247 *
248 * PM_ISU can go either on TTM0 or TTM1, but that's the only
249 * choice we have to deal with.
250 */
251 if (unituse[PM_ISU] &
252 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
253 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
254 /* Set TTM[01]SEL fields. */
255 ttmuse[0] = ttmuse[1] = 0;
256 for (i = PM_FPU; i <= PM_STS; ++i) {
257 if (!unituse[i])
258 continue;
259 ttm = unitmap[i];
260 ++ttmuse[(ttm >> 2) & 1];
261 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
262 }
263 /* Check only one unit per TTMx */
264 if (ttmuse[0] > 1 || ttmuse[1] > 1)
265 return -1;
266
267 /* Set byte lane select fields and TTM3SEL. */
268 for (byte = 0; byte < 4; ++byte) {
269 unit = busbyte[byte];
270 if (!unit)
271 continue;
272 if (unit <= PM_STS)
273 ttm = (unitmap[unit] >> 2) & 1;
274 else if (unit == PM_LSU0)
275 ttm = 2;
276 else {
277 ttm = 3;
278 if (unit == PM_LSU1L && byte >= 2)
279 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
280 }
281 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
282 }
283
284 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
285 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
286 for (i = 0; i < n_ev; ++i) {
287 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
288 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
289 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
290 psel = event[i] & PM_PMCSEL_MSK;
291 if (!pmc) {
292 /* Bus event or any-PMC direct event */
293 if (unit)
294 psel |= 0x10 | ((byte & 2) << 2);
295 else
296 psel |= 8;
297 for (pmc = 0; pmc < 8; ++pmc) {
298 if (pmc_inuse & (1 << pmc))
299 continue;
300 grp = (pmc >> 1) & 1;
301 if (unit) {
302 if (grp == (byte & 1))
303 break;
304 } else if (pmc_grp_use[grp] < 4) {
305 ++pmc_grp_use[grp];
306 break;
307 }
308 }
309 pmc_inuse |= 1 << pmc;
310 } else {
311 /* Direct event */
312 --pmc;
313 if (psel == 0 && (byte & 2))
314 /* add events on higher-numbered bus */
315 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
316 }
317 pmcsel[pmc] = psel;
318 hwc[i] = pmc;
319 }
320 for (pmc = 0; pmc < 2; ++pmc)
321 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
322 for (; pmc < 8; ++pmc)
323 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
324 if (pmc_inuse & 1)
325 mmcr0 |= MMCR0_PMC1CE;
326 if (pmc_inuse & 0xfe)
327 mmcr0 |= MMCR0_PMCjCE;
328
329 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
330
331 /* Return MMCRx values */
332 mmcr[0] = mmcr0;
333 mmcr[1] = mmcr1;
334 mmcr[2] = mmcra;
335 return 0;
336}
337
338static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
339{
340 int shift, i;
341
342 if (pmc <= 1) {
343 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
344 i = 0;
345 } else {
346 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
347 i = 1;
348 }
349 /*
350 * Setting the PMCxSEL field to 0x08 disables PMC x.
351 */
352 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
353}
354
355static int ppc970_generic_events[] = {
356 [PERF_COUNT_CPU_CYCLES] = 7,
357 [PERF_COUNT_INSTRUCTIONS] = 1,
358 [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
359 [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
360 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
361 [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
362};
363
364struct power_pmu ppc970_pmu = {
365 .n_counter = 8,
366 .max_alternatives = 2,
367 .add_fields = 0x001100005555ull,
368 .test_adder = 0x013300000000ull,
369 .compute_mmcr = p970_compute_mmcr,
370 .get_constraint = p970_get_constraint,
371 .get_alternatives = p970_get_alternatives,
372 .disable_pmc = p970_disable_pmc,
373 .n_generic = ARRAY_SIZE(ppc970_generic_events),
374 .generic_events = ppc970_generic_events,
375};
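The compute_mmcr pass above repeatedly unpacks each event code into its PMC, unit, byte-lane and PMCSEL fields with (event >> PM_xxx_SH) & PM_xxx_MSK pairs. A stand-alone sketch of that unpacking is shown below; the shift and mask values are illustrative placeholders only, not the real PPC970 PM_* definitions from earlier in this file.

/*
 * Stand-alone sketch of the event-code unpacking used in
 * p970_compute_mmcr().  NOTE: the field positions below are
 * illustrative placeholders, not the actual PPC970 PM_* values.
 */
#include <stdio.h>

#define PM_PMC_SH	12	/* placeholder */
#define PM_PMC_MSK	0xf
#define PM_UNIT_SH	8	/* placeholder */
#define PM_UNIT_MSK	0xf
#define PM_BYTE_SH	4	/* placeholder */
#define PM_BYTE_MSK	3
#define PM_PMCSEL_MSK	0xf	/* placeholder */

int main(void)
{
	unsigned int event = 0x2431;	/* arbitrary example code */
	unsigned int pmc  = (event >> PM_PMC_SH) & PM_PMC_MSK;
	unsigned int unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
	unsigned int byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
	unsigned int psel = event & PM_PMCSEL_MSK;

	printf("pmc=%u unit=%u byte=%u psel=%#x\n", pmc, unit, byte, psel);
	return 0;
}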
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index e868b5c50723..dc0f3c933518 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64 1config PPC64
2 bool "64-bit kernel" 2 bool "64-bit kernel"
3 default n 3 default n
4 select HAVE_PERF_COUNTERS
4 help 5 help
5 This option selects whether a 32-bit or a 64-bit kernel 6 This option selects whether a 32-bit or a 64-bit kernel
6 will be built. 7 will be built.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8fd4d1..1f4844505765 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -685,6 +685,7 @@ config X86_UP_IOAPIC
685config X86_LOCAL_APIC 685config X86_LOCAL_APIC
686 def_bool y 686 def_bool y
687 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) 687 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
688 select HAVE_PERF_COUNTERS if (!M386 && !M486)
688 689
689config X86_IO_APIC 690config X86_IO_APIC
690 def_bool y 691 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 9c79b2477008..01e7c4c5c7fe 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
823 .quad compat_sys_signalfd4 823 .quad compat_sys_signalfd4
824 .quad sys_eventfd2 824 .quad sys_eventfd2
825 .quad sys_epoll_create1 825 .quad sys_epoll_create1
826 .quad sys_dup3 /* 330 */ 826 .quad sys_dup3 /* 330 */
827 .quad sys_pipe2 827 .quad sys_pipe2
828 .quad sys_inotify_init1 828 .quad sys_inotify_init1
829 .quad sys_perf_counter_open
829ia32_syscall_end: 830ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..977250ed8b89 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_set - set atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 *
298 * Atomically sets the value of @ptr to @new_val.
299 */
300static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
301{
302 unsigned long long old_val;
303
304 do {
 305		old_val = __atomic64_read(ptr);
306 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
307}
308
309/**
310 * atomic64_read - read atomic64 variable
311 * @ptr: pointer to type atomic64_t
312 *
313 * Atomically reads the value of @ptr and returns it.
314 */
315static inline unsigned long long atomic64_read(atomic64_t *ptr)
316{
317 unsigned long long curr_val;
318
319 do {
320 curr_val = __atomic64_read(ptr);
321 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
322
323 return curr_val;
324}
325
326/**
327 * atomic64_add_return - add and return
328 * @delta: integer value to add
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically adds @delta to @ptr and returns @delta + *@ptr
332 */
333static inline unsigned long long
334atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
335{
336 unsigned long long old_val, new_val;
337
338 do {
 339		old_val = __atomic64_read(ptr);
340 new_val = old_val + delta;
341
342 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
343
344 return new_val;
345}
346
347static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
348{
349 return atomic64_add_return(-delta, ptr);
350}
351
352static inline long atomic64_inc_return(atomic64_t *ptr)
353{
354 return atomic64_add_return(1, ptr);
355}
356
357static inline long atomic64_dec_return(atomic64_t *ptr)
358{
359 return atomic64_sub_return(1, ptr);
360}
361
362/**
363 * atomic64_add - add integer to atomic64 variable
364 * @delta: integer value to add
365 * @ptr: pointer to type atomic64_t
366 *
367 * Atomically adds @delta to @ptr.
368 */
369static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
370{
371 atomic64_add_return(delta, ptr);
372}
373
374/**
 375 * atomic64_sub - subtract from the atomic64 variable
376 * @delta: integer value to subtract
377 * @ptr: pointer to type atomic64_t
378 *
379 * Atomically subtracts @delta from @ptr.
380 */
381static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
382{
383 atomic64_add(-delta, ptr);
384}
385
386/**
387 * atomic64_sub_and_test - subtract value from variable and test result
388 * @delta: integer value to subtract
389 * @ptr: pointer to type atomic64_t
390 *
391 * Atomically subtracts @delta from @ptr and returns
392 * true if the result is zero, or false for all
393 * other cases.
394 */
395static inline int
396atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
397{
398 unsigned long long old_val = atomic64_sub_return(delta, ptr);
399
400 return old_val == 0;
401}
402
403/**
404 * atomic64_inc - increment atomic64 variable
405 * @ptr: pointer to type atomic64_t
406 *
407 * Atomically increments @ptr by 1.
408 */
409static inline void atomic64_inc(atomic64_t *ptr)
410{
411 atomic64_add(1, ptr);
412}
413
414/**
415 * atomic64_dec - decrement atomic64 variable
416 * @ptr: pointer to type atomic64_t
417 *
418 * Atomically decrements @ptr by 1.
419 */
420static inline void atomic64_dec(atomic64_t *ptr)
421{
422 atomic64_sub(1, ptr);
423}
424
425/**
426 * atomic64_dec_and_test - decrement and test
427 * @ptr: pointer to type atomic64_t
428 *
429 * Atomically decrements @ptr by 1 and
430 * returns true if the result is 0, or false for all other
431 * cases.
432 */
433static inline int atomic64_dec_and_test(atomic64_t *ptr)
434{
435 return atomic64_sub_and_test(1, ptr);
436}
437
438/**
439 * atomic64_inc_and_test - increment and test
440 * @ptr: pointer to type atomic64_t
441 *
442 * Atomically increments @ptr by 1
443 * and returns true if the result is zero, or false for all
444 * other cases.
445 */
446static inline int atomic64_inc_and_test(atomic64_t *ptr)
447{
448 return atomic64_sub_and_test(-1, ptr);
449}
450
451/**
452 * atomic64_add_negative - add and test if negative
453 * @delta: integer value to add
454 * @ptr: pointer to type atomic64_t
455 *
456 * Atomically adds @delta to @ptr and returns true
457 * if the result is negative, or false when
458 * result is greater than or equal to zero.
459 */
460static inline int
461atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
462{
463 long long old_val = atomic64_add_return(delta, ptr);
464
465 return old_val < 0;
466}
467
250#include <asm-generic/atomic.h> 468#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 469#endif /* _ASM_X86_ATOMIC_32_H */
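Every atomic64 helper above reduces to the same pattern: sample the current value, compute the new one, and let cmpxchg8b retry the update until no other CPU raced in between. Below is a user-space sketch of that retry loop, using GCC's __sync_val_compare_and_swap builtin purely for illustration; the kernel code above uses LOCK cmpxchg8b directly.

/*
 * User-space sketch of the cmpxchg retry loop used by the atomic64
 * helpers above (illustration only; the kernel uses LOCK cmpxchg8b).
 */
#include <stdio.h>

static unsigned long long counter;

static unsigned long long my_atomic64_add_return(unsigned long long delta,
						 unsigned long long *ptr)
{
	unsigned long long old_val, new_val;

	do {
		old_val = *ptr;		    /* racy read is fine ...          */
		new_val = old_val + delta; /* ... the CAS below validates it */
	} while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val);

	return new_val;
}

int main(void)
{
	my_atomic64_add_return(5, &counter);
	my_atomic64_add_return(37, &counter);
	printf("counter = %llu\n", counter);	/* prints 42 */
	return 0;
}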
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index d4b5d731073f..7838276bfe51 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
9 unsigned long idle_timestamp; 9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */ 10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */ 11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int apic_perf_irqs; /* arch dependent */
12 unsigned int irq0_irqs; 13 unsigned int irq0_irqs;
13 unsigned int irq_resched_count; 14 unsigned int irq_resched_count;
14 unsigned int irq_call_count; 15 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
index a65bab20f6ce..42930b279215 100644
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -9,6 +9,7 @@ typedef struct {
9 unsigned int __softirq_pending; 9 unsigned int __softirq_pending;
10 unsigned int __nmi_count; /* arch dependent */ 10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */ 11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int apic_perf_irqs; /* arch dependent */
12 unsigned int irq0_irqs; 13 unsigned int irq0_irqs;
13 unsigned int irq_resched_count; 14 unsigned int irq_resched_count;
14 unsigned int irq_call_count; 15 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b959..aa93e53b85ee 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
30/* Interrupt handlers registered during init_IRQ */ 30/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void); 31extern void apic_timer_interrupt(void);
32extern void error_interrupt(void); 32extern void error_interrupt(void);
33extern void perf_counter_interrupt(void);
34
33extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
34extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
35extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index a16a2ab2b429..1554d0236e03 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
87#define LOCAL_TIMER_VECTOR 0xef 87#define LOCAL_TIMER_VECTOR 0xef
88 88
89/* 89/*
90 * Performance monitoring interrupt vector:
91 */
92#define LOCAL_PERF_VECTOR 0xee
93
94/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we 95 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority 96 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector) 97 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..ad31e5d90e90 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
25 * a much simpler SMP time architecture: 25 * a much simpler SMP time architecture:
26 */ 26 */
27#ifdef CONFIG_X86_LOCAL_APIC 27#ifdef CONFIG_X86_LOCAL_APIC
28
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) 29BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 30BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 31BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31 32
33#ifdef CONFIG_PERF_COUNTERS
34BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
35#endif
36
32#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 38BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif 39#endif
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..2e08ed736647
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,95 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(int nmi);
90#else
91static inline void init_hw_perf_counters(void) { }
92static inline void perf_counters_lapic_init(int nmi) { }
93#endif
94
95#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index b46f8ca007b5..f38488989db7 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
83#define TIF_SECCOMP 8 /* secure computing */ 83#define TIF_SECCOMP 8 /* secure computing */
84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
85#define TIF_PERF_COUNTERS 11 /* notify perf counter work */
85#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 86#define TIF_NOTSC 16 /* TSC is not accessible in userland */
86#define TIF_IA32 17 /* 32bit process */ 87#define TIF_IA32 17 /* 32bit process */
87#define TIF_FORK 18 /* ret_from_fork */ 88#define TIF_FORK 18 /* ret_from_fork */
@@ -104,6 +105,7 @@ struct thread_info {
104#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 105#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
105#define _TIF_SECCOMP (1 << TIF_SECCOMP) 106#define _TIF_SECCOMP (1 << TIF_SECCOMP)
106#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 107#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
108#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS)
107#define _TIF_NOTSC (1 << TIF_NOTSC) 109#define _TIF_NOTSC (1 << TIF_NOTSC)
108#define _TIF_IA32 (1 << TIF_IA32) 110#define _TIF_IA32 (1 << TIF_IA32)
109#define _TIF_FORK (1 << TIF_FORK) 111#define _TIF_FORK (1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
135 137
136/* Only used for 64 bit */ 138/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \ 139#define _TIF_DO_NOTIFY_MASK \
138 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 140 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
139 141
140/* flags to check in __switch_to() */ 142/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \ 143#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..7e47658b0a6f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
338#define __NR_dup3 330 338#define __NR_dup3 330
339#define __NR_pipe2 331 339#define __NR_pipe2 331
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_perf_counter_open 333
341 342
342#ifdef __KERNEL__ 343#ifdef __KERNEL__
343 344
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666f..53025feaf88d 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
653__SYSCALL(__NR_pipe2, sys_pipe2) 653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294 654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1) 655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656 656#define __NR_perf_counter_open 295
657__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
657 658
658#ifndef __NO_STUBS 659#ifndef __NO_STUBS
659#define __ARCH_WANT_OLD_READDIR 660#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 485787955834..e9af14f748ea 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -35,6 +35,7 @@
35#include <linux/nmi.h> 35#include <linux/nmi.h>
36#include <linux/timex.h> 36#include <linux/timex.h>
37 37
38#include <asm/perf_counter.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <asm/mtrr.h> 40#include <asm/mtrr.h>
40#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -1144,6 +1145,7 @@ void __cpuinit setup_local_APIC(void)
1144 apic_write(APIC_ESR, 0); 1145 apic_write(APIC_ESR, 0);
1145 } 1146 }
1146#endif 1147#endif
1148 perf_counters_lapic_init(0);
1147 1149
1148 preempt_disable(); 1150 preempt_disable();
1149 1151
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..c3813306e0b4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -22,11 +22,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 24
25obj-$(CONFIG_X86_MCE) += mcheck/ 25obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
26obj-$(CONFIG_MTRR) += mtrr/
27obj-$(CONFIG_CPU_FREQ) += cpufreq/
28 26
29obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 27obj-$(CONFIG_X86_MCE) += mcheck/
28obj-$(CONFIG_MTRR) += mtrr/
29obj-$(CONFIG_CPU_FREQ) += cpufreq/
30
31obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
30 32
31quiet_cmd_mkcapflags = MKCAP $@ 33quiet_cmd_mkcapflags = MKCAP $@
32 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 34 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7976a6a0f65c..95eb30e1e677 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/mtrr.h> 18#include <asm/mtrr.h>
19#include <asm/mce.h> 19#include <asm/mce.h>
20#include <asm/perf_counter.h>
20#include <asm/pat.h> 21#include <asm/pat.h>
21#include <asm/asm.h> 22#include <asm/asm.h>
22#include <asm/numa.h> 23#include <asm/numa.h>
@@ -774,6 +775,7 @@ void __init identify_boot_cpu(void)
774#else 775#else
775 vgetcpu_set_mode(); 776 vgetcpu_set_mode();
776#endif 777#endif
778 init_hw_perf_counters();
777} 779}
778 780
779void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 781void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..9376771f757b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,695 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/perf_counter.h>
11#include <linux/capability.h>
12#include <linux/notifier.h>
13#include <linux/hardirq.h>
14#include <linux/kprobes.h>
15#include <linux/module.h>
16#include <linux/kdebug.h>
17#include <linux/sched.h>
18
19#include <asm/perf_counter.h>
20#include <asm/apic.h>
21
22static bool perf_counters_initialized __read_mostly;
23
24/*
25 * Number of (generic) HW counters:
26 */
27static int nr_counters_generic __read_mostly;
28static u64 perf_counter_mask __read_mostly;
29static u64 counter_value_mask __read_mostly;
30
31static int nr_counters_fixed __read_mostly;
32
33struct cpu_hw_counters {
34 struct perf_counter *counters[X86_PMC_IDX_MAX];
35 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
36};
37
38/*
39 * Intel PerfMon v3. Used on Core2 and later.
40 */
41static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
42
43static const int intel_perfmon_event_map[] =
44{
45 [PERF_COUNT_CPU_CYCLES] = 0x003c,
46 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
47 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
48 [PERF_COUNT_CACHE_MISSES] = 0x412e,
49 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
50 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
51 [PERF_COUNT_BUS_CYCLES] = 0x013c,
52};
53
54static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
55
56/*
57 * Propagate counter elapsed time into the generic counter.
58 * Can only be executed on the CPU where the counter is active.
  59 * Adds the delta of events processed to the generic count.
60 */
61static void
62x86_perf_counter_update(struct perf_counter *counter,
63 struct hw_perf_counter *hwc, int idx)
64{
65 u64 prev_raw_count, new_raw_count, delta;
66
67 /*
68 * Careful: an NMI might modify the previous counter value.
69 *
70 * Our tactic to handle this is to first atomically read and
71 * exchange a new raw count - then add that new-prev delta
72 * count to the generic counter atomically:
73 */
74again:
75 prev_raw_count = atomic64_read(&hwc->prev_count);
76 rdmsrl(hwc->counter_base + idx, new_raw_count);
77
78 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
79 new_raw_count) != prev_raw_count)
80 goto again;
81
82 /*
83 * Now we have the new raw value and have updated the prev
84 * timestamp already. We can now calculate the elapsed delta
85 * (counter-)time and add that to the generic counter.
86 *
87 * Careful, not all hw sign-extends above the physical width
88 * of the count, so we do that by clipping the delta to 32 bits:
89 */
90 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
91
92 atomic64_add(delta, &counter->count);
93 atomic64_sub(delta, &hwc->period_left);
94}
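The delta computation above can be checked in isolation: because the programmed period never exceeds 2^31, casting both raw samples to s32 before subtracting and then truncating to u32 yields the elapsed event count even across a counter wrap or differing sign extension. A small stand-alone check, with made-up sample values:

/*
 * Stand-alone check of the 32-bit delta clipping used in
 * x86_perf_counter_update() above.  The sample values are made up:
 * the counter wrapped from near 2^32 back to a small value.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t prev_raw_count = 0xffffffffffffff00ULL; /* sign-extended */
	uint64_t new_raw_count  = 0x0000000000000010ULL; /* after wrap    */

	uint64_t delta = (uint64_t)(uint32_t)
		((int32_t)new_raw_count - (int32_t)prev_raw_count);

	printf("delta = %llu\n", (unsigned long long)delta); /* prints 272 */
	return 0;
}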
95
96/*
  97 * Set up the hardware configuration for a given hw_event_type
98 */
99static int __hw_perf_counter_init(struct perf_counter *counter)
100{
101 struct perf_counter_hw_event *hw_event = &counter->hw_event;
102 struct hw_perf_counter *hwc = &counter->hw;
103
104 if (unlikely(!perf_counters_initialized))
105 return -EINVAL;
106
107 /*
108 * Count user events, and generate PMC IRQs:
109 * (keep 'enabled' bit clear for now)
110 */
111 hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
112
113 /*
114 * If privileged enough, count OS events too, and allow
115 * NMI events as well:
116 */
117 hwc->nmi = 0;
118 if (capable(CAP_SYS_ADMIN)) {
119 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
120 if (hw_event->nmi)
121 hwc->nmi = 1;
122 }
123
124 hwc->irq_period = hw_event->irq_period;
125 /*
126 * Intel PMCs cannot be accessed sanely above 32 bit width,
127 * so we install an artificial 1<<31 period regardless of
128 * the generic counter period:
129 */
130 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
131 hwc->irq_period = 0x7FFFFFFF;
132
133 atomic64_set(&hwc->period_left, hwc->irq_period);
134
135 /*
 136	 * Raw event types provide the config in the event structure
137 */
138 if (hw_event->raw) {
139 hwc->config |= hw_event->type;
140 } else {
141 if (hw_event->type >= max_intel_perfmon_events)
142 return -EINVAL;
143 /*
144 * The generic map:
145 */
146 hwc->config |= intel_perfmon_event_map[hw_event->type];
147 }
148 counter->wakeup_pending = 0;
149
150 return 0;
151}
152
153u64 hw_perf_save_disable(void)
154{
155 u64 ctrl;
156
157 if (unlikely(!perf_counters_initialized))
158 return 0;
159
160 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
161 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
162
163 return ctrl;
164}
165EXPORT_SYMBOL_GPL(hw_perf_save_disable);
166
167void hw_perf_restore(u64 ctrl)
168{
169 if (unlikely(!perf_counters_initialized))
170 return;
171
172 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
173}
174EXPORT_SYMBOL_GPL(hw_perf_restore);
175
176static inline void
177__pmc_fixed_disable(struct perf_counter *counter,
178 struct hw_perf_counter *hwc, unsigned int __idx)
179{
180 int idx = __idx - X86_PMC_IDX_FIXED;
181 u64 ctrl_val, mask;
182 int err;
183
184 mask = 0xfULL << (idx * 4);
185
186 rdmsrl(hwc->config_base, ctrl_val);
187 ctrl_val &= ~mask;
188 err = checking_wrmsrl(hwc->config_base, ctrl_val);
189}
190
191static inline void
192__pmc_generic_disable(struct perf_counter *counter,
193 struct hw_perf_counter *hwc, unsigned int idx)
194{
195 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
196 __pmc_fixed_disable(counter, hwc, idx);
197 else
198 wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
199}
200
201static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
202
203/*
204 * Set the next IRQ period, based on the hwc->period_left value.
205 * To be called with the counter disabled in hw:
206 */
207static void
208__hw_perf_counter_set_period(struct perf_counter *counter,
209 struct hw_perf_counter *hwc, int idx)
210{
211 s64 left = atomic64_read(&hwc->period_left);
212 s32 period = hwc->irq_period;
213 int err;
214
215 /*
 216	 * If we are way outside a reasonable range then just skip forward:
217 */
218 if (unlikely(left <= -period)) {
219 left = period;
220 atomic64_set(&hwc->period_left, left);
221 }
222
223 if (unlikely(left <= 0)) {
224 left += period;
225 atomic64_set(&hwc->period_left, left);
226 }
227
228 per_cpu(prev_left[idx], smp_processor_id()) = left;
229
230 /*
231 * The hw counter starts counting from this counter offset,
232 * mark it to be able to extra future deltas:
233 */
234 atomic64_set(&hwc->prev_count, (u64)-left);
235
236 err = checking_wrmsrl(hwc->counter_base + idx,
237 (u64)(-left) & counter_value_mask);
238}
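Programming the counter with (u64)(-left) & counter_value_mask means the hardware needs exactly `left` more increments to overflow and raise the interrupt. A worked example, assuming a hypothetical 48-bit counter width:

/*
 * Worked example of the "-left" programming above, assuming a
 * hypothetical 48-bit wide counter (bit_width = 48).
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t counter_value_mask = (1ULL << 48) - 1;
	int64_t  left = 0x7fffffff;		/* events until next PMI */

	uint64_t programmed = (uint64_t)(-left) & counter_value_mask;

	/*
	 * programmed + left == 2^48, i.e. the counter overflows after
	 * exactly `left` further increments:
	 */
	printf("programmed = %#llx\n", (unsigned long long)programmed);
	printf("overflow after %lld events\n",
	       (long long)((counter_value_mask + 1) - programmed));
	return 0;
}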
239
240static inline void
241__pmc_fixed_enable(struct perf_counter *counter,
242 struct hw_perf_counter *hwc, unsigned int __idx)
243{
244 int idx = __idx - X86_PMC_IDX_FIXED;
245 u64 ctrl_val, bits, mask;
246 int err;
247
248 /*
249 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
250 * and enable ring-0 counting if allowed:
251 */
252 bits = 0x8ULL | 0x2ULL;
253 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
254 bits |= 0x1;
255 bits <<= (idx * 4);
256 mask = 0xfULL << (idx * 4);
257
258 rdmsrl(hwc->config_base, ctrl_val);
259 ctrl_val &= ~mask;
260 ctrl_val |= bits;
261 err = checking_wrmsrl(hwc->config_base, ctrl_val);
262}
263
264static void
265__pmc_generic_enable(struct perf_counter *counter,
266 struct hw_perf_counter *hwc, int idx)
267{
268 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
269 __pmc_fixed_enable(counter, hwc, idx);
270 else
271 wrmsr(hwc->config_base + idx,
272 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
273}
274
275static int
276fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
277{
278 unsigned int event;
279
280 if (unlikely(hwc->nmi))
281 return -1;
282
283 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
284
285 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
286 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
287 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
288 return X86_PMC_IDX_FIXED_CPU_CYCLES;
289 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
290 return X86_PMC_IDX_FIXED_BUS_CYCLES;
291
292 return -1;
293}
294
295/*
296 * Find a PMC slot for the freshly enabled / scheduled in counter:
297 */
298static int pmc_generic_enable(struct perf_counter *counter)
299{
300 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
301 struct hw_perf_counter *hwc = &counter->hw;
302 int idx;
303
304 idx = fixed_mode_idx(counter, hwc);
305 if (idx >= 0) {
306 /*
307 * Try to get the fixed counter, if that is already taken
308 * then try to get a generic counter:
309 */
310 if (test_and_set_bit(idx, cpuc->used))
311 goto try_generic;
312
313 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
314 /*
315 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
316 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
317 */
318 hwc->counter_base =
319 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
320 hwc->idx = idx;
321 } else {
322 idx = hwc->idx;
323 /* Try to get the previous generic counter again */
324 if (test_and_set_bit(idx, cpuc->used)) {
325try_generic:
326 idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
327 if (idx == nr_counters_generic)
328 return -EAGAIN;
329
330 set_bit(idx, cpuc->used);
331 hwc->idx = idx;
332 }
333 hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
334 hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
335 }
336
337 perf_counters_lapic_init(hwc->nmi);
338
339 __pmc_generic_disable(counter, hwc, idx);
340
341 cpuc->counters[idx] = counter;
342 /*
343 * Make it visible before enabling the hw:
344 */
345 smp_wmb();
346
347 __hw_perf_counter_set_period(counter, hwc, idx);
348 __pmc_generic_enable(counter, hwc, idx);
349
350 return 0;
351}
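The counter_base arithmetic above (MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED) is what lets rdmsrl/wrmsrl elsewhere use counter_base + idx uniformly for generic and fixed counters. A small illustrative sketch, using the MSR and index constants from asm/perf_counter.h in this patch:

/*
 * Sketch of the counter_base arithmetic used for fixed counters above,
 * with the constants from asm/perf_counter.h in this patch.
 */
#include <stdio.h>

#define MSR_ARCH_PERFMON_FIXED_CTR0	0x309
#define X86_PMC_IDX_FIXED		32

int main(void)
{
	unsigned long counter_base =
		MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
	int n;

	for (n = 0; n < 3; n++)		/* fixed counters 0..2 */
		printf("idx %d -> MSR %#lx\n",
		       X86_PMC_IDX_FIXED + n,
		       counter_base + X86_PMC_IDX_FIXED + n);
	return 0;
}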
352
353void perf_counter_print_debug(void)
354{
355 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
356 struct cpu_hw_counters *cpuc;
357 int cpu, idx;
358
359 if (!nr_counters_generic)
360 return;
361
362 local_irq_disable();
363
364 cpu = smp_processor_id();
365 cpuc = &per_cpu(cpu_hw_counters, cpu);
366
367 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
368 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
369 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
370 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
371
372 printk(KERN_INFO "\n");
373 printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
374 printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
375 printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
376 printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed);
377 printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
378
379 for (idx = 0; idx < nr_counters_generic; idx++) {
380 rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
381 rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
382
383 prev_left = per_cpu(prev_left[idx], cpu);
384
385 printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n",
386 cpu, idx, pmc_ctrl);
387 printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n",
388 cpu, idx, pmc_count);
389 printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n",
390 cpu, idx, prev_left);
391 }
392 for (idx = 0; idx < nr_counters_fixed; idx++) {
393 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
394
395 printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
396 cpu, idx, pmc_count);
397 }
398 local_irq_enable();
399}
400
401static void pmc_generic_disable(struct perf_counter *counter)
402{
403 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
404 struct hw_perf_counter *hwc = &counter->hw;
405 unsigned int idx = hwc->idx;
406
407 __pmc_generic_disable(counter, hwc, idx);
408
409 clear_bit(idx, cpuc->used);
410 cpuc->counters[idx] = NULL;
411 /*
412 * Make sure the cleared pointer becomes visible before we
413 * (potentially) free the counter:
414 */
415 smp_wmb();
416
417 /*
418 * Drain the remaining delta count out of a counter
419 * that we are disabling:
420 */
421 x86_perf_counter_update(counter, hwc, idx);
422}
423
424static void perf_store_irq_data(struct perf_counter *counter, u64 data)
425{
426 struct perf_data *irqdata = counter->irqdata;
427
428 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
429 irqdata->overrun++;
430 } else {
431 u64 *p = (u64 *) &irqdata->data[irqdata->len];
432
433 *p = data;
434 irqdata->len += sizeof(u64);
435 }
436}
437
438/*
439 * Save and restart an expired counter. Called by NMI contexts,
440 * so it has to be careful about preempting normal counter ops:
441 */
442static void perf_save_and_restart(struct perf_counter *counter)
443{
444 struct hw_perf_counter *hwc = &counter->hw;
445 int idx = hwc->idx;
446
447 x86_perf_counter_update(counter, hwc, idx);
448 __hw_perf_counter_set_period(counter, hwc, idx);
449
450 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
451 __pmc_generic_enable(counter, hwc, idx);
452}
453
454static void
455perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
456{
457 struct perf_counter *counter, *group_leader = sibling->group_leader;
458
459 /*
460 * Store sibling timestamps (if any):
461 */
462 list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
463
464 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
465 perf_store_irq_data(sibling, counter->hw_event.type);
466 perf_store_irq_data(sibling, atomic64_read(&counter->count));
467 }
468}
469
470/*
471 * This handler is triggered by the local APIC, so the APIC IRQ handling
472 * rules apply:
473 */
474static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
475{
476 int bit, cpu = smp_processor_id();
477 u64 ack, status, saved_global;
478 struct cpu_hw_counters *cpuc;
479
480 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
481
482 /* Disable counters globally */
483 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
484 ack_APIC_irq();
485
486 cpuc = &per_cpu(cpu_hw_counters, cpu);
487
488 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
489 if (!status)
490 goto out;
491
492again:
493 ack = status;
494 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
495 struct perf_counter *counter = cpuc->counters[bit];
496
497 clear_bit(bit, (unsigned long *) &status);
498 if (!counter)
499 continue;
500
501 perf_save_and_restart(counter);
502
503 switch (counter->hw_event.record_type) {
504 case PERF_RECORD_SIMPLE:
505 continue;
506 case PERF_RECORD_IRQ:
507 perf_store_irq_data(counter, instruction_pointer(regs));
508 break;
509 case PERF_RECORD_GROUP:
510 perf_handle_group(counter, &status, &ack);
511 break;
512 }
513 /*
514 * From NMI context we cannot call into the scheduler to
 515		 * do a task wakeup - but we mark the counters as
 516		 * wakeup_pending and initiate a wakeup callback:
517 */
518 if (nmi) {
519 counter->wakeup_pending = 1;
520 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
521 } else {
522 wake_up(&counter->waitq);
523 }
524 }
525
526 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
527
528 /*
529 * Repeat if there is more work to be done:
530 */
531 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
532 if (status)
533 goto again;
534out:
535 /*
536 * Restore - do not reenable when global enable is off:
537 */
538 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
539}
540
541void smp_perf_counter_interrupt(struct pt_regs *regs)
542{
543 irq_enter();
544 inc_irq_stat(apic_perf_irqs);
545 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
546 __smp_perf_counter_interrupt(regs, 0);
547
548 irq_exit();
549}
550
551/*
552 * This handler is triggered by NMI contexts:
553 */
554void perf_counter_notify(struct pt_regs *regs)
555{
556 struct cpu_hw_counters *cpuc;
557 unsigned long flags;
558 int bit, cpu;
559
560 local_irq_save(flags);
561 cpu = smp_processor_id();
562 cpuc = &per_cpu(cpu_hw_counters, cpu);
563
564 for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
565 struct perf_counter *counter = cpuc->counters[bit];
566
567 if (!counter)
568 continue;
569
570 if (counter->wakeup_pending) {
571 counter->wakeup_pending = 0;
572 wake_up(&counter->waitq);
573 }
574 }
575
576 local_irq_restore(flags);
577}
578
579void __cpuinit perf_counters_lapic_init(int nmi)
580{
581 u32 apic_val;
582
583 if (!perf_counters_initialized)
584 return;
585 /*
586 * Enable the performance counter vector in the APIC LVT:
587 */
588 apic_val = apic_read(APIC_LVTERR);
589
590 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
591 if (nmi)
592 apic_write(APIC_LVTPC, APIC_DM_NMI);
593 else
594 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
595 apic_write(APIC_LVTERR, apic_val);
596}
597
598static int __kprobes
599perf_counter_nmi_handler(struct notifier_block *self,
600 unsigned long cmd, void *__args)
601{
602 struct die_args *args = __args;
603 struct pt_regs *regs;
604
605 if (likely(cmd != DIE_NMI_IPI))
606 return NOTIFY_DONE;
607
608 regs = args->regs;
609
610 apic_write(APIC_LVTPC, APIC_DM_NMI);
611 __smp_perf_counter_interrupt(regs, 1);
612
613 return NOTIFY_STOP;
614}
615
616static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
617 .notifier_call = perf_counter_nmi_handler
618};
619
620void __init init_hw_perf_counters(void)
621{
622 union cpuid10_eax eax;
623 unsigned int ebx;
624 unsigned int unused;
625 union cpuid10_edx edx;
626
627 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
628 return;
629
630 /*
631 * Check whether the Architectural PerfMon supports
632 * Branch Misses Retired Event or not.
633 */
634 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
635 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
636 return;
637
638 printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
639
640 printk(KERN_INFO "... version: %d\n", eax.split.version_id);
641 printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters);
642 nr_counters_generic = eax.split.num_counters;
643 if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
644 nr_counters_generic = X86_PMC_MAX_GENERIC;
645 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
646 nr_counters_generic, X86_PMC_MAX_GENERIC);
647 }
648 perf_counter_mask = (1 << nr_counters_generic) - 1;
649 perf_max_counters = nr_counters_generic;
650
651 printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width);
652 counter_value_mask = (1ULL << eax.split.bit_width) - 1;
653 printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask);
654
655 printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length);
656
657 nr_counters_fixed = edx.split.num_counters_fixed;
658 if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
659 nr_counters_fixed = X86_PMC_MAX_FIXED;
660 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
661 nr_counters_fixed, X86_PMC_MAX_FIXED);
662 }
663 printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed);
664
665 perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
666
667 printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask);
668 perf_counters_initialized = true;
669
670 perf_counters_lapic_init(0);
671 register_die_notifier(&perf_counter_nmi_notifier);
672}
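The CPUID leaf 0xA enumeration performed by init_hw_perf_counters() can also be reproduced from user space to preview what the kernel will detect. A minimal sketch, assuming a GCC/clang toolchain that provides <cpuid.h> (not part of this patch); the bit layout follows the cpuid10_eax/cpuid10_edx unions above:

/*
 * User-space sketch of the CPUID leaf 0xA enumeration done by
 * init_hw_perf_counters() above.  Requires GCC/clang <cpuid.h>.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 0xA not available\n");
		return 1;
	}

	printf("version:        %u\n", eax & 0xff);
	printf("num counters:   %u\n", (eax >> 8) & 0xff);
	printf("bit width:      %u\n", (eax >> 16) & 0xff);
	printf("mask length:    %u\n", (eax >> 24) & 0xff);
	printf("fixed counters: %u\n", edx & 0xf);
	return 0;
}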
673
674static void pmc_generic_read(struct perf_counter *counter)
675{
676 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
677}
678
679static const struct hw_perf_counter_ops x86_perf_counter_ops = {
680 .enable = pmc_generic_enable,
681 .disable = pmc_generic_disable,
682 .read = pmc_generic_read,
683};
684
685const struct hw_perf_counter_ops *
686hw_perf_counter_init(struct perf_counter *counter)
687{
688 int err;
689
690 err = __hw_perf_counter_init(counter);
691 if (err)
692 return NULL;
693
694 return &x86_perf_counter_ops;
695}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c52b60919163..c092e7d2686d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1025,6 +1025,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1025apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1026 spurious_interrupt smp_spurious_interrupt
1027 1027
1028#ifdef CONFIG_PERF_COUNTERS
1029apicinterrupt LOCAL_PERF_VECTOR \
1030 perf_counter_interrupt smp_perf_counter_interrupt
1031#endif
1032
1028/* 1033/*
1029 * Exception entry points. 1034 * Exception entry points.
1030 */ 1035 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 8b30d0c2512c..a6bca1d33a8a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -53,6 +53,10 @@ static int show_other_interrupts(struct seq_file *p)
53 for_each_online_cpu(j) 53 for_each_online_cpu(j)
54 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 54 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
55 seq_printf(p, " Local timer interrupts\n"); 55 seq_printf(p, " Local timer interrupts\n");
56 seq_printf(p, "CNT: ");
57 for_each_online_cpu(j)
58 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
59 seq_printf(p, " Performance counter interrupts\n");
56#endif 60#endif
57#ifdef CONFIG_SMP 61#ifdef CONFIG_SMP
58 seq_printf(p, "RES: "); 62 seq_printf(p, "RES: ");
@@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
160 164
161#ifdef CONFIG_X86_LOCAL_APIC 165#ifdef CONFIG_X86_LOCAL_APIC
162 sum += irq_stats(cpu)->apic_timer_irqs; 166 sum += irq_stats(cpu)->apic_timer_irqs;
167 sum += irq_stats(cpu)->apic_perf_irqs;
163#endif 168#endif
164#ifdef CONFIG_SMP 169#ifdef CONFIG_SMP
165 sum += irq_stats(cpu)->irq_resched_count; 170 sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1507ad4e674d..0bef6280f30c 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -171,6 +171,9 @@ void __init native_init_IRQ(void)
171 /* IPI vectors for APIC spurious and error interrupts */ 171 /* IPI vectors for APIC spurious and error interrupts */
172 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 172 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
173 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 173 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
174# ifdef CONFIG_PERF_COUNTERS
175 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
176# endif
174#endif 177#endif
175 178
176#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 179#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f30..6a71bfc51e51 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -150,6 +150,11 @@ static void __init apic_intr_init(void)
150 /* IPI vectors for APIC spurious and error interrupts */ 150 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
153
154 /* Performance monitoring interrupt: */
155#ifdef CONFIG_PERF_COUNTERS
156 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
157#endif
153} 158}
154 159
155void __init native_init_IRQ(void) 160void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 89bb7668041d..4fa5243c2069 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9 9#include <linux/perf_counter.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
@@ -886,6 +886,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
886 tracehook_notify_resume(regs); 886 tracehook_notify_resume(regs);
887 } 887 }
888 888
889 if (thread_info_flags & _TIF_PERF_COUNTERS) {
890 clear_thread_flag(TIF_PERF_COUNTERS);
891 perf_counter_notify(regs);
892 }
893
889#ifdef CONFIG_X86_32 894#ifdef CONFIG_X86_32
890 clear_thread_flag(TIF_IRET); 895 clear_thread_flag(TIF_IRET);
891#endif /* CONFIG_X86_32 */ 896#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..496726ddcea1 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c744cf3..07c914555a5e 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 66a9d8145562..7acb23f830ce 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -271,8 +271,11 @@ static atomic_t c3_cpu_count;
271/* Common C-state entry for C2, C3, .. */ 271/* Common C-state entry for C2, C3, .. */
272static void acpi_cstate_enter(struct acpi_processor_cx *cstate) 272static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
273{ 273{
274 u64 perf_flags;
275
274 /* Don't trace irqs off for idle */ 276 /* Don't trace irqs off for idle */
275 stop_critical_timings(); 277 stop_critical_timings();
278 perf_flags = hw_perf_save_disable();
276 if (cstate->entry_method == ACPI_CSTATE_FFH) { 279 if (cstate->entry_method == ACPI_CSTATE_FFH) {
277 /* Call into architectural FFH based C-state */ 280 /* Call into architectural FFH based C-state */
278 acpi_processor_ffh_cstate_enter(cstate); 281 acpi_processor_ffh_cstate_enter(cstate);
@@ -285,6 +288,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
285 gets asserted in time to freeze execution properly. */ 288 gets asserted in time to freeze execution properly. */
286 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 289 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
287 } 290 }
291 hw_perf_restore(perf_flags);
288 start_critical_timings(); 292 start_critical_timings();
289} 293}
290#endif /* !CONFIG_CPU_IDLE */ 294#endif /* !CONFIG_CPU_IDLE */
@@ -1426,8 +1430,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
1426 */ 1430 */
1427static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 1431static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1428{ 1432{
1433 u64 pctrl;
1434
1429 /* Don't trace irqs off for idle */ 1435 /* Don't trace irqs off for idle */
1430 stop_critical_timings(); 1436 stop_critical_timings();
1437 pctrl = hw_perf_save_disable();
1431 if (cx->entry_method == ACPI_CSTATE_FFH) { 1438 if (cx->entry_method == ACPI_CSTATE_FFH) {
1432 /* Call into architectural FFH based C-state */ 1439 /* Call into architectural FFH based C-state */
1433 acpi_processor_ffh_cstate_enter(cx); 1440 acpi_processor_ffh_cstate_enter(cx);
@@ -1442,6 +1449,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1442 gets asserted in time to freeze execution properly. */ 1449 gets asserted in time to freeze execution properly. */
1443 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 1450 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
1444 } 1451 }
1452 hw_perf_restore(pctrl);
1445 start_critical_timings(); 1453 start_critical_timings();
1446} 1454}
1447 1455
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index d41b9f6f7903..5a3eab0882a0 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
244 struct pt_regs *regs = get_irq_regs(); 245 struct pt_regs *regs = get_irq_regs();
245 if (regs) 246 if (regs)
246 show_regs(regs); 247 show_regs(regs);
248 perf_counter_print_debug();
247} 249}
248static struct sysrq_key_op sysrq_showregs_op = { 250static struct sysrq_key_op sysrq_showregs_op = {
249 .handler = sysrq_handle_showregs, 251 .handler = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index 71a6efe5d8bd..605be573fe87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -1010,6 +1011,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1010 1011
1011 current->personality &= ~bprm->per_clear; 1012 current->personality &= ~bprm->per_clear;
1012 1013
1014 /*
1015 * Flush performance counters when crossing a
1016 * security domain:
1017 */
1018 if (!get_dumpable(current->mm))
1019 perf_counter_exit_task(current);
1020
1013 /* An exec changes our domain. We are no longer part of the thread 1021 /* An exec changes our domain. We are no longer part of the thread
1014 group */ 1022 group */
1015 1023
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2f3c2d4ef73b..49a40fbc806b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -115,6 +115,16 @@ extern struct group_info init_groups;
115 115
116extern struct cred init_cred; 116extern struct cred init_cred;
117 117
118#ifdef CONFIG_PERF_COUNTERS
119# define INIT_PERF_COUNTERS(tsk) \
120 .perf_counter_ctx.counter_list = \
121 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
122 .perf_counter_ctx.lock = \
123 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
124#else
125# define INIT_PERF_COUNTERS(tsk)
126#endif
127
118/* 128/*
119 * INIT_TASK is used to set up the first task table, touch at 129 * INIT_TASK is used to set up the first task table, touch at
120 * your own risk!. Base=0, limit=0x1fffff (=2MB) 130 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -179,6 +189,7 @@ extern struct cred init_cred;
179 INIT_IDS \ 189 INIT_IDS \
180 INIT_TRACE_IRQFLAGS \ 190 INIT_TRACE_IRQFLAGS \
181 INIT_LOCKDEP \ 191 INIT_LOCKDEP \
192 INIT_PERF_COUNTERS(tsk) \
182} 193}
183 194
184 195
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d20413119..ecfa66817634 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
78 return sum; 78 return sum;
79} 79}
80 80
81
82/*
83 * Lock/unlock the current runqueue - to extract task statistics:
84 */
85extern void curr_rq_lock_irq_save(unsigned long *flags);
86extern void curr_rq_unlock_irq_restore(unsigned long *flags);
87extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
81extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
82extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
83extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
84extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..33ba9fe0a781
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,290 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <asm/atomic.h>
17#include <asm/ioctl.h>
18
19#ifdef CONFIG_PERF_COUNTERS
20# include <asm/perf_counter.h>
21#endif
22
23#include <linux/list.h>
24#include <linux/mutex.h>
25#include <linux/rculist.h>
26#include <linux/rcupdate.h>
27#include <linux/spinlock.h>
28
29struct task_struct;
30
31/*
32 * User-space ABI bits:
33 */
34
35/*
36 * Generalized performance counter event types, used by the hw_event.type
37 * parameter of the sys_perf_counter_open() syscall:
38 */
39enum hw_event_types {
40 /*
41 * Common hardware events, generalized by the kernel:
42 */
43 PERF_COUNT_CPU_CYCLES = 0,
44 PERF_COUNT_INSTRUCTIONS = 1,
45 PERF_COUNT_CACHE_REFERENCES = 2,
46 PERF_COUNT_CACHE_MISSES = 3,
47 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
48 PERF_COUNT_BRANCH_MISSES = 5,
49 PERF_COUNT_BUS_CYCLES = 6,
50
51 PERF_HW_EVENTS_MAX = 7,
52
53 /*
54 * Special "software" counters provided by the kernel, even if
55 * the hardware does not support performance counters. These
56 * counters measure various physical and sw events of the
57 * kernel (and allow the profiling of them as well):
58 */
59 PERF_COUNT_CPU_CLOCK = -1,
60 PERF_COUNT_TASK_CLOCK = -2,
61 PERF_COUNT_PAGE_FAULTS = -3,
62 PERF_COUNT_CONTEXT_SWITCHES = -4,
63 PERF_COUNT_CPU_MIGRATIONS = -5,
64
65 PERF_SW_EVENTS_MIN = -6,
66};
67
68/*
69 * IRQ-notification data record type:
70 */
71enum perf_counter_record_type {
72 PERF_RECORD_SIMPLE = 0,
73 PERF_RECORD_IRQ = 1,
74 PERF_RECORD_GROUP = 2,
75};
76
77/*
78 * Hardware event to monitor via a performance monitoring counter:
79 */
80struct perf_counter_hw_event {
81 s64 type;
82
83 u64 irq_period;
84 u32 record_type;
85
86 u32 disabled : 1, /* off by default */
87 nmi : 1, /* NMI sampling */
88 raw : 1, /* raw event type */
89 inherit : 1, /* children inherit it */
90 pinned : 1, /* must always be on PMU */
91 exclusive : 1, /* only counter on PMU */
92
93 __reserved_1 : 26;
94
95 u64 __reserved_2;
96};
97
98/*
99 * Ioctls that can be done on a perf counter fd:
100 */
101#define PERF_COUNTER_IOC_ENABLE _IO('$', 0)
102#define PERF_COUNTER_IOC_DISABLE _IO('$', 1)
103
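The ABI above is everything an application needs: fill in a struct perf_counter_hw_event, pass it to the new syscall, and drive the returned file descriptor with read() and the two ioctls. A minimal user-space sketch, not part of the patch, assuming the updated <linux/perf_counter.h> and unistd.h headers are visible to user space (otherwise the constants and the struct layout have to be copied by hand):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    int main(void)
    {
            struct perf_counter_hw_event hw_event;
            uint64_t count;
            int fd;

            memset(&hw_event, 0, sizeof(hw_event));
            hw_event.type     = PERF_COUNT_INSTRUCTIONS;  /* generalized hw event */
            hw_event.disabled = 1;                        /* created off, enabled via ioctl */

            /* pid 0: calling task, cpu -1: any CPU, group_fd -1: no group */
            fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
            if (fd < 0)
                    return 1;

            ioctl(fd, PERF_COUNTER_IOC_ENABLE);
            /* ... run the code to be measured ... */
            ioctl(fd, PERF_COUNTER_IOC_DISABLE);

            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("instructions: %llu\n", (unsigned long long)count);
            close(fd);
            return 0;
    }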
104/*
105 * Kernel-internal data types:
106 */
107
108/**
109 * struct hw_perf_counter - performance counter hardware details:
110 */
111struct hw_perf_counter {
112#ifdef CONFIG_PERF_COUNTERS
113 u64 config;
114 unsigned long config_base;
115 unsigned long counter_base;
116 int nmi;
117 unsigned int idx;
118 atomic64_t prev_count;
119 u64 irq_period;
120 atomic64_t period_left;
121#endif
122};
123
124/*
125 * Hardcoded buffer length limit for now, for IRQ-fed events:
126 */
127#define PERF_DATA_BUFLEN 2048
128
129/**
130 * struct perf_data - performance counter IRQ data sampling ...
131 */
132struct perf_data {
133 int len;
134 int rd_idx;
135 int overrun;
136 u8 data[PERF_DATA_BUFLEN];
137};
138
139struct perf_counter;
140
141/**
142 * struct hw_perf_counter_ops - performance counter hw ops
143 */
144struct hw_perf_counter_ops {
145 int (*enable) (struct perf_counter *counter);
146 void (*disable) (struct perf_counter *counter);
147 void (*read) (struct perf_counter *counter);
148};
149
150/**
151 * enum perf_counter_active_state - the states of a counter
152 */
153enum perf_counter_active_state {
154 PERF_COUNTER_STATE_ERROR = -2,
155 PERF_COUNTER_STATE_OFF = -1,
156 PERF_COUNTER_STATE_INACTIVE = 0,
157 PERF_COUNTER_STATE_ACTIVE = 1,
158};
159
160struct file;
161
162/**
163 * struct perf_counter - performance counter kernel representation:
164 */
165struct perf_counter {
166#ifdef CONFIG_PERF_COUNTERS
167 struct list_head list_entry;
168 struct list_head sibling_list;
169 struct perf_counter *group_leader;
170 const struct hw_perf_counter_ops *hw_ops;
171
172 enum perf_counter_active_state state;
173 atomic64_t count;
174
175 struct perf_counter_hw_event hw_event;
176 struct hw_perf_counter hw;
177
178 struct perf_counter_context *ctx;
179 struct task_struct *task;
180 struct file *filp;
181
182 struct perf_counter *parent;
183 struct list_head child_list;
184
185 /*
186 * Protect attach/detach and child_list:
187 */
188 struct mutex mutex;
189
190 int oncpu;
191 int cpu;
192
193 /* read() / irq related data */
194 wait_queue_head_t waitq;
195 /* optional: for NMIs */
196 int wakeup_pending;
197 struct perf_data *irqdata;
198 struct perf_data *usrdata;
199 struct perf_data data[2];
200#endif
201};
202
203/**
204 * struct perf_counter_context - counter context structure
205 *
206 * Used as a container for task counters and CPU counters as well:
207 */
208struct perf_counter_context {
209#ifdef CONFIG_PERF_COUNTERS
210 /*
211 * Protect the states of the counters in the list,
212 * nr_active, and the list:
213 */
214 spinlock_t lock;
215 /*
216 * Protect the list of counters. Locking either mutex or lock
217 * is sufficient to ensure the list doesn't change; to change
218 * the list you need to lock both the mutex and the spinlock.
219 */
220 struct mutex mutex;
221
222 struct list_head counter_list;
223 int nr_counters;
224 int nr_active;
225 int is_active;
226 struct task_struct *task;
227#endif
228};
229
230/**
231 * struct perf_counter_cpu_context - per cpu counter context structure
232 */
233struct perf_cpu_context {
234 struct perf_counter_context ctx;
235 struct perf_counter_context *task_ctx;
236 int active_oncpu;
237 int max_pertask;
238 int exclusive;
239};
240
241/*
242 * Set by architecture code:
243 */
244extern int perf_max_counters;
245
246#ifdef CONFIG_PERF_COUNTERS
247extern const struct hw_perf_counter_ops *
248hw_perf_counter_init(struct perf_counter *counter);
249
250extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
251extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
252extern void perf_counter_task_tick(struct task_struct *task, int cpu);
253extern void perf_counter_init_task(struct task_struct *child);
254extern void perf_counter_exit_task(struct task_struct *child);
255extern void perf_counter_notify(struct pt_regs *regs);
256extern void perf_counter_print_debug(void);
257extern u64 hw_perf_save_disable(void);
258extern void hw_perf_restore(u64 ctrl);
259extern int perf_counter_task_disable(void);
260extern int perf_counter_task_enable(void);
261extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
262 struct perf_cpu_context *cpuctx,
263 struct perf_counter_context *ctx, int cpu);
264
265/*
266 * Return 1 for a software counter, 0 for a hardware counter
267 */
268static inline int is_software_counter(struct perf_counter *counter)
269{
270 return !counter->hw_event.raw && counter->hw_event.type < 0;
271}
272
273#else
274static inline void
275perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
276static inline void
277perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
278static inline void
279perf_counter_task_tick(struct task_struct *task, int cpu) { }
280static inline void perf_counter_init_task(struct task_struct *child) { }
281static inline void perf_counter_exit_task(struct task_struct *child) { }
282static inline void perf_counter_notify(struct pt_regs *regs) { }
283static inline void perf_counter_print_debug(void) { }
284static inline void hw_perf_restore(u64 ctrl) { }
285static inline u64 hw_perf_save_disable(void) { return 0; }
286static inline int perf_counter_task_disable(void) { return -EINVAL; }
287static inline int perf_counter_task_enable(void) { return -EINVAL; }
288#endif
289
290#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
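The two new prctl() commands are routed to perf_counter_task_disable()/perf_counter_task_enable() in the core (kernel/sys.c is updated accordingly) and flip every counter attached to the calling task at once. A hedged sketch; run_uncounted() is a made-up helper, and the #defines are only a fallback for headers that do not yet carry the values from the hunk above:

    #include <sys/prctl.h>

    #ifndef PR_TASK_PERF_COUNTERS_DISABLE
    # define PR_TASK_PERF_COUNTERS_DISABLE 31
    # define PR_TASK_PERF_COUNTERS_ENABLE  32
    #endif

    /* Temporarily stop counting for a region of the calling task. */
    static void run_uncounted(void (*fn)(void))
    {
            prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
            fn();           /* events in this window are not accumulated */
            prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
    }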
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..f134a0f7080a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/fs_struct.h> 71#include <linux/fs_struct.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -1031,6 +1032,8 @@ struct sched_entity {
1031 u64 last_wakeup; 1032 u64 last_wakeup;
1032 u64 avg_overlap; 1033 u64 avg_overlap;
1033 1034
1035 u64 nr_migrations;
1036
1034#ifdef CONFIG_SCHEDSTATS 1037#ifdef CONFIG_SCHEDSTATS
1035 u64 wait_start; 1038 u64 wait_start;
1036 u64 wait_max; 1039 u64 wait_max;
@@ -1046,7 +1049,6 @@ struct sched_entity {
1046 u64 exec_max; 1049 u64 exec_max;
1047 u64 slice_max; 1050 u64 slice_max;
1048 1051
1049 u64 nr_migrations;
1050 u64 nr_migrations_cold; 1052 u64 nr_migrations_cold;
1051 u64 nr_failed_migrations_affine; 1053 u64 nr_failed_migrations_affine;
1052 u64 nr_failed_migrations_running; 1054 u64 nr_failed_migrations_running;
@@ -1349,6 +1351,7 @@ struct task_struct {
1349 struct list_head pi_state_list; 1351 struct list_head pi_state_list;
1350 struct futex_pi_state *pi_state_cache; 1352 struct futex_pi_state *pi_state_cache;
1351#endif 1353#endif
1354 struct perf_counter_context perf_counter_ctx;
1352#ifdef CONFIG_NUMA 1355#ifdef CONFIG_NUMA
1353 struct mempolicy *mempolicy; 1356 struct mempolicy *mempolicy;
1354 short il_next; 1357 short il_next;
@@ -2322,6 +2325,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2322#define TASK_SIZE_OF(tsk) TASK_SIZE 2325#define TASK_SIZE_OF(tsk) TASK_SIZE
2323#endif 2326#endif
2324 2327
2328/*
2329 * Call the function if the target task is executing on a CPU right now:
2330 */
2331extern void task_oncpu_function_call(struct task_struct *p,
2332 void (*func) (void *info), void *info);
2333
2334
2325#ifdef CONFIG_MM_OWNER 2335#ifdef CONFIG_MM_OWNER
2326extern void mm_update_next_owner(struct mm_struct *mm); 2336extern void mm_update_next_owner(struct mm_struct *mm);
2327extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2337extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 18d0a243a7b3..a1d177ce0a08 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
54struct compat_timeval; 54struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct perf_counter_hw_event;
57 58
58#include <linux/types.h> 59#include <linux/types.h>
59#include <linux/aio_abi.h> 60#include <linux/aio_abi.h>
@@ -624,4 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
624 625
625int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 626int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
626 627
628
629asmlinkage int sys_perf_counter_open(
630
631 struct perf_counter_hw_event *hw_event_uptr __user,
632 pid_t pid,
633 int cpu,
634 int group_fd);
627#endif 635#endif
diff --git a/init/Kconfig b/init/Kconfig
index a724a149bf3f..a588cdc274bc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -777,6 +777,36 @@ config AIO
777 by some high performance threaded applications. Disabling 777 by some high performance threaded applications. Disabling
778 this option saves about 7k. 778 this option saves about 7k.
779 779
780config HAVE_PERF_COUNTERS
781 bool
782
783menu "Performance Counters"
784
785config PERF_COUNTERS
786 bool "Kernel Performance Counters"
787 depends on HAVE_PERF_COUNTERS
788 default y
789 select ANON_INODES
790 help
791 Enable kernel support for performance counter hardware.
792
793 Performance counters are special hardware registers available
794 on most modern CPUs. These registers count the number of certain
795 types of hw events: such as instructions executed, cachemisses
796 suffered, or branches mis-predicted - without slowing down the
797 kernel or applications. These registers can also trigger interrupts
798 when a threshold number of events have passed - and can thus be
799 used to profile the code that runs on that CPU.
800
801 The Linux Performance Counter subsystem provides an abstraction of
802 these hardware capabilities, available via a system call. It
803 provides per task and per CPU counters, and it provides event
804 capabilities on top of those.
805
806 Say Y if unsure.
807
808endmenu
809
780config VM_EVENT_COUNTERS 810config VM_EVENT_COUNTERS
781 default y 811 default y
782 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 812 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 2aebc4cd7878..e4115926c536 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 93obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 94obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
96obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
96 97
97ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 98ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
98# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 99# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index c7740fa3252c..cbdb39a498eb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -159,6 +159,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
159{ 159{
160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
161 161
162#ifdef CONFIG_PERF_COUNTERS
163 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
164#endif
162 trace_sched_process_free(tsk); 165 trace_sched_process_free(tsk);
163 put_task_struct(tsk); 166 put_task_struct(tsk);
164} 167}
@@ -1093,10 +1096,6 @@ NORET_TYPE void do_exit(long code)
1093 tsk->mempolicy = NULL; 1096 tsk->mempolicy = NULL;
1094#endif 1097#endif
1095#ifdef CONFIG_FUTEX 1098#ifdef CONFIG_FUTEX
1096 /*
1097 * This must happen late, after the PID is not
1098 * hashed anymore:
1099 */
1100 if (unlikely(!list_empty(&tsk->pi_state_list))) 1099 if (unlikely(!list_empty(&tsk->pi_state_list)))
1101 exit_pi_state_list(tsk); 1100 exit_pi_state_list(tsk);
1102 if (unlikely(current->pi_state_cache)) 1101 if (unlikely(current->pi_state_cache))
@@ -1361,6 +1360,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1361 */ 1360 */
1362 read_unlock(&tasklist_lock); 1361 read_unlock(&tasklist_lock);
1363 1362
1363 /*
1364 * Flush inherited counters to the parent - before the parent
1365 * gets woken up by child-exit notifications.
1366 */
1367 perf_counter_exit_task(p);
1368
1364 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1369 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1365 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1370 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1366 ? p->signal->group_exit_code : p->exit_code; 1371 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f1255dd8..b1f8609287eb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -985,6 +985,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
985 goto fork_out; 985 goto fork_out;
986 986
987 rt_mutex_init_task(p); 987 rt_mutex_init_task(p);
988 perf_counter_init_task(p);
988 989
989#ifdef CONFIG_PROVE_LOCKING 990#ifdef CONFIG_PROVE_LOCKING
990 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 991 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..1ac18daa424f
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,2169 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/file.h>
14#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
21#include <linux/kernel_stat.h>
22#include <linux/perf_counter.h>
23
24/*
25 * Each CPU has a list of per CPU counters:
26 */
27DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
28
29int perf_max_counters __read_mostly = 1;
30static int perf_reserved_percpu __read_mostly;
31static int perf_overcommit __read_mostly = 1;
32
33/*
34 * Mutex for (sysadmin-configurable) counter reservations:
35 */
36static DEFINE_MUTEX(perf_resource_mutex);
37
38/*
39 * Architecture provided APIs - weak aliases:
40 */
41extern __weak const struct hw_perf_counter_ops *
42hw_perf_counter_init(struct perf_counter *counter)
43{
44 return NULL;
45}
46
47u64 __weak hw_perf_save_disable(void) { return 0; }
48void __weak hw_perf_restore(u64 ctrl) { barrier(); }
49void __weak hw_perf_counter_setup(int cpu) { barrier(); }
50int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
51 struct perf_cpu_context *cpuctx,
52 struct perf_counter_context *ctx, int cpu)
53{
54 return 0;
55}
56
57void __weak perf_counter_print_debug(void) { }
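The weak aliases above spell out the contract an architecture backend fills in; the real implementations live in arch/x86/kernel/cpu/perf_counter.c and arch/powerpc/kernel/perf_counter.c in this series. Purely to illustrate the shape, a hypothetical PMU backend (my_pmu_* names are invented, not code from this patch):

    static int my_pmu_enable(struct perf_counter *counter)
    {
            /* claim a hardware counter, program the event, start it */
            return 0;
    }

    static void my_pmu_disable(struct perf_counter *counter)
    {
            /* stop the hardware counter and fold its value into counter->count */
    }

    static void my_pmu_read(struct perf_counter *counter)
    {
            /* refresh counter->count from the hardware register */
    }

    static const struct hw_perf_counter_ops my_pmu_ops = {
            .enable         = my_pmu_enable,
            .disable        = my_pmu_disable,
            .read           = my_pmu_read,
    };

    /* Overrides the __weak stub when the architecture has a usable PMU: */
    const struct hw_perf_counter_ops *
    hw_perf_counter_init(struct perf_counter *counter)
    {
            return &my_pmu_ops;
    }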
58
59static void
60list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
61{
62 struct perf_counter *group_leader = counter->group_leader;
63
64 /*
65 * Depending on whether it is a standalone or sibling counter,
66 * add it straight to the context's counter list, or to the group
67 * leader's sibling list:
68 */
69 if (counter->group_leader == counter)
70 list_add_tail(&counter->list_entry, &ctx->counter_list);
71 else
72 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
73}
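From user space, the leader/sibling distinction handled here comes straight from the group_fd argument of sys_perf_counter_open(): a counter opened with group_fd == -1 becomes its own group leader, while later counters that pass the leader's fd end up on its sibling_list and are scheduled on and off the PMU as one unit. A fragment continuing the earlier sketch (same assumptions about the installed headers):

    struct perf_counter_hw_event cycles = { .type = PERF_COUNT_CPU_CYCLES };
    struct perf_counter_hw_event insns  = { .type = PERF_COUNT_INSTRUCTIONS };
    int leader, sibling;

    /* standalone counter: becomes its own group leader */
    leader  = syscall(__NR_perf_counter_open, &cycles, 0, -1, -1);
    /* joins the leader's group */
    sibling = syscall(__NR_perf_counter_open, &insns,  0, -1, leader);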
74
75static void
76list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
77{
78 struct perf_counter *sibling, *tmp;
79
80 list_del_init(&counter->list_entry);
81
82 /*
83 * If this was a group counter with sibling counters then
84 * upgrade the siblings to singleton counters by adding them
85 * to the context list directly:
86 */
87 list_for_each_entry_safe(sibling, tmp,
88 &counter->sibling_list, list_entry) {
89
90 list_del_init(&sibling->list_entry);
91 list_add_tail(&sibling->list_entry, &ctx->counter_list);
92 sibling->group_leader = sibling;
93 }
94}
95
96static void
97counter_sched_out(struct perf_counter *counter,
98 struct perf_cpu_context *cpuctx,
99 struct perf_counter_context *ctx)
100{
101 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
102 return;
103
104 counter->state = PERF_COUNTER_STATE_INACTIVE;
105 counter->hw_ops->disable(counter);
106 counter->oncpu = -1;
107
108 if (!is_software_counter(counter))
109 cpuctx->active_oncpu--;
110 ctx->nr_active--;
111 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
112 cpuctx->exclusive = 0;
113}
114
115static void
116group_sched_out(struct perf_counter *group_counter,
117 struct perf_cpu_context *cpuctx,
118 struct perf_counter_context *ctx)
119{
120 struct perf_counter *counter;
121
122 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
123 return;
124
125 counter_sched_out(group_counter, cpuctx, ctx);
126
127 /*
128 * Schedule out siblings (if any):
129 */
130 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
131 counter_sched_out(counter, cpuctx, ctx);
132
133 if (group_counter->hw_event.exclusive)
134 cpuctx->exclusive = 0;
135}
136
137/*
138 * Cross CPU call to remove a performance counter
139 *
140 * We disable the counter on the hardware level first. After that we
141 * remove it from the context list.
142 */
143static void __perf_counter_remove_from_context(void *info)
144{
145 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
146 struct perf_counter *counter = info;
147 struct perf_counter_context *ctx = counter->ctx;
148 unsigned long flags;
149 u64 perf_flags;
150
151 /*
152 * If this is a task context, we need to check whether it is
153 * the current task context of this cpu. If not it has been
154 * scheduled out before the smp call arrived.
155 */
156 if (ctx->task && cpuctx->task_ctx != ctx)
157 return;
158
159 curr_rq_lock_irq_save(&flags);
160 spin_lock(&ctx->lock);
161
162 counter_sched_out(counter, cpuctx, ctx);
163
164 counter->task = NULL;
165 ctx->nr_counters--;
166
167 /*
168 * Protect the list operation against NMI by disabling the
169 * counters on a global level. NOP for non NMI based counters.
170 */
171 perf_flags = hw_perf_save_disable();
172 list_del_counter(counter, ctx);
173 hw_perf_restore(perf_flags);
174
175 if (!ctx->task) {
176 /*
177 * Allow more per task counters with respect to the
178 * reservation:
179 */
180 cpuctx->max_pertask =
181 min(perf_max_counters - ctx->nr_counters,
182 perf_max_counters - perf_reserved_percpu);
183 }
184
185 spin_unlock(&ctx->lock);
186 curr_rq_unlock_irq_restore(&flags);
187}
188
189
190/*
191 * Remove the counter from a task's (or a CPU's) list of counters.
192 *
193 * Must be called with counter->mutex and ctx->mutex held.
194 *
195 * CPU counters are removed with a smp call. For task counters we only
196 * call when the task is on a CPU.
197 */
198static void perf_counter_remove_from_context(struct perf_counter *counter)
199{
200 struct perf_counter_context *ctx = counter->ctx;
201 struct task_struct *task = ctx->task;
202
203 if (!task) {
204 /*
205 * Per cpu counters are removed via an smp call and
206 * the removal is always successful.
207 */
208 smp_call_function_single(counter->cpu,
209 __perf_counter_remove_from_context,
210 counter, 1);
211 return;
212 }
213
214retry:
215 task_oncpu_function_call(task, __perf_counter_remove_from_context,
216 counter);
217
218 spin_lock_irq(&ctx->lock);
219 /*
220 * If the context is active we need to retry the smp call.
221 */
222 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
223 spin_unlock_irq(&ctx->lock);
224 goto retry;
225 }
226
227 /*
228 * The lock prevents this context from being scheduled in, so we
229 * can remove the counter safely if the call above did not
230 * succeed.
231 */
232 if (!list_empty(&counter->list_entry)) {
233 ctx->nr_counters--;
234 list_del_counter(counter, ctx);
235 counter->task = NULL;
236 }
237 spin_unlock_irq(&ctx->lock);
238}
239
240/*
241 * Cross CPU call to disable a performance counter
242 */
243static void __perf_counter_disable(void *info)
244{
245 struct perf_counter *counter = info;
246 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
247 struct perf_counter_context *ctx = counter->ctx;
248 unsigned long flags;
249
250 /*
251 * If this is a per-task counter, need to check whether this
252 * counter's task is the current task on this cpu.
253 */
254 if (ctx->task && cpuctx->task_ctx != ctx)
255 return;
256
257 curr_rq_lock_irq_save(&flags);
258 spin_lock(&ctx->lock);
259
260 /*
261 * If the counter is on, turn it off.
262 * If it is in error state, leave it in error state.
263 */
264 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
265 if (counter == counter->group_leader)
266 group_sched_out(counter, cpuctx, ctx);
267 else
268 counter_sched_out(counter, cpuctx, ctx);
269 counter->state = PERF_COUNTER_STATE_OFF;
270 }
271
272 spin_unlock(&ctx->lock);
273 curr_rq_unlock_irq_restore(&flags);
274}
275
276/*
277 * Disable a counter.
278 */
279static void perf_counter_disable(struct perf_counter *counter)
280{
281 struct perf_counter_context *ctx = counter->ctx;
282 struct task_struct *task = ctx->task;
283
284 if (!task) {
285 /*
286 * Disable the counter on the cpu that it's on
287 */
288 smp_call_function_single(counter->cpu, __perf_counter_disable,
289 counter, 1);
290 return;
291 }
292
293 retry:
294 task_oncpu_function_call(task, __perf_counter_disable, counter);
295
296 spin_lock_irq(&ctx->lock);
297 /*
298 * If the counter is still active, we need to retry the cross-call.
299 */
300 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
301 spin_unlock_irq(&ctx->lock);
302 goto retry;
303 }
304
305 /*
306 * Since we have the lock this context can't be scheduled
307 * in, so we can change the state safely.
308 */
309 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
310 counter->state = PERF_COUNTER_STATE_OFF;
311
312 spin_unlock_irq(&ctx->lock);
313}
314
315/*
316 * Disable a counter and all its children.
317 */
318static void perf_counter_disable_family(struct perf_counter *counter)
319{
320 struct perf_counter *child;
321
322 perf_counter_disable(counter);
323
324 /*
325 * Lock the mutex to protect the list of children
326 */
327 mutex_lock(&counter->mutex);
328 list_for_each_entry(child, &counter->child_list, child_list)
329 perf_counter_disable(child);
330 mutex_unlock(&counter->mutex);
331}
332
333static int
334counter_sched_in(struct perf_counter *counter,
335 struct perf_cpu_context *cpuctx,
336 struct perf_counter_context *ctx,
337 int cpu)
338{
339 if (counter->state <= PERF_COUNTER_STATE_OFF)
340 return 0;
341
342 counter->state = PERF_COUNTER_STATE_ACTIVE;
343 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
344 /*
345 * The new state must be visible before we turn it on in the hardware:
346 */
347 smp_wmb();
348
349 if (counter->hw_ops->enable(counter)) {
350 counter->state = PERF_COUNTER_STATE_INACTIVE;
351 counter->oncpu = -1;
352 return -EAGAIN;
353 }
354
355 if (!is_software_counter(counter))
356 cpuctx->active_oncpu++;
357 ctx->nr_active++;
358
359 if (counter->hw_event.exclusive)
360 cpuctx->exclusive = 1;
361
362 return 0;
363}
364
365/*
366 * Return 1 for a group consisting entirely of software counters,
367 * 0 if the group contains any hardware counters.
368 */
369static int is_software_only_group(struct perf_counter *leader)
370{
371 struct perf_counter *counter;
372
373 if (!is_software_counter(leader))
374 return 0;
375 list_for_each_entry(counter, &leader->sibling_list, list_entry)
376 if (!is_software_counter(counter))
377 return 0;
378 return 1;
379}
380
381/*
382 * Work out whether we can put this counter group on the CPU now.
383 */
384static int group_can_go_on(struct perf_counter *counter,
385 struct perf_cpu_context *cpuctx,
386 int can_add_hw)
387{
388 /*
389 * Groups consisting entirely of software counters can always go on.
390 */
391 if (is_software_only_group(counter))
392 return 1;
393 /*
394 * If an exclusive group is already on, no other hardware
395 * counters can go on.
396 */
397 if (cpuctx->exclusive)
398 return 0;
399 /*
400 * If this group is exclusive and there are already
401 * counters on the CPU, it can't go on.
402 */
403 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
404 return 0;
405 /*
406 * Otherwise, try to add it if all previous groups were able
407 * to go on.
408 */
409 return can_add_hw;
410}
411
412/*
413 * Cross CPU call to install and enable a performance counter
414 */
415static void __perf_install_in_context(void *info)
416{
417 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
418 struct perf_counter *counter = info;
419 struct perf_counter_context *ctx = counter->ctx;
420 struct perf_counter *leader = counter->group_leader;
421 int cpu = smp_processor_id();
422 unsigned long flags;
423 u64 perf_flags;
424 int err;
425
426 /*
427 * If this is a task context, we need to check whether it is
428 * the current task context of this cpu. If not it has been
429 * scheduled out before the smp call arrived.
430 */
431 if (ctx->task && cpuctx->task_ctx != ctx)
432 return;
433
434 curr_rq_lock_irq_save(&flags);
435 spin_lock(&ctx->lock);
436
437 /*
438 * Protect the list operation against NMI by disabling the
439 * counters on a global level. NOP for non NMI based counters.
440 */
441 perf_flags = hw_perf_save_disable();
442
443 list_add_counter(counter, ctx);
444 ctx->nr_counters++;
445
446 /*
447 * Don't put the counter on if it is disabled or if
448 * it is in a group and the group isn't on.
449 */
450 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
451 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
452 goto unlock;
453
454 /*
455 * An exclusive counter can't go on if there are already active
456 * hardware counters, and no hardware counter can go on if there
457 * is already an exclusive counter on.
458 */
459 if (!group_can_go_on(counter, cpuctx, 1))
460 err = -EEXIST;
461 else
462 err = counter_sched_in(counter, cpuctx, ctx, cpu);
463
464 if (err) {
465 /*
466 * This counter couldn't go on. If it is in a group
467 * then we have to pull the whole group off.
468 * If the counter group is pinned then put it in error state.
469 */
470 if (leader != counter)
471 group_sched_out(leader, cpuctx, ctx);
472 if (leader->hw_event.pinned)
473 leader->state = PERF_COUNTER_STATE_ERROR;
474 }
475
476 if (!err && !ctx->task && cpuctx->max_pertask)
477 cpuctx->max_pertask--;
478
479 unlock:
480 hw_perf_restore(perf_flags);
481
482 spin_unlock(&ctx->lock);
483 curr_rq_unlock_irq_restore(&flags);
484}
485
486/*
487 * Attach a performance counter to a context
488 *
489 * First we add the counter to the list with the hardware enable bit
490 * in counter->hw_config cleared.
491 *
492 * If the counter is attached to a task which is on a CPU we use a smp
493 * call to enable it in the task context. The task might have been
494 * scheduled away, but we check this in the smp call again.
495 *
496 * Must be called with ctx->mutex held.
497 */
498static void
499perf_install_in_context(struct perf_counter_context *ctx,
500 struct perf_counter *counter,
501 int cpu)
502{
503 struct task_struct *task = ctx->task;
504
505 counter->ctx = ctx;
506 if (!task) {
507 /*
508 * Per cpu counters are installed via an smp call and
509 * the install is always successful.
510 */
511 smp_call_function_single(cpu, __perf_install_in_context,
512 counter, 1);
513 return;
514 }
515
516 counter->task = task;
517retry:
518 task_oncpu_function_call(task, __perf_install_in_context,
519 counter);
520
521 spin_lock_irq(&ctx->lock);
522 /*
523 * If the context is active we need to retry the smp call.
524 */
525 if (ctx->is_active && list_empty(&counter->list_entry)) {
526 spin_unlock_irq(&ctx->lock);
527 goto retry;
528 }
529
530 /*
531 * The lock prevents this context from being scheduled in, so we
532 * can add the counter safely if the call above did not
533 * succeed.
534 */
535 if (list_empty(&counter->list_entry)) {
536 list_add_counter(counter, ctx);
537 ctx->nr_counters++;
538 }
539 spin_unlock_irq(&ctx->lock);
540}
541
542/*
543 * Cross CPU call to enable a performance counter
544 */
545static void __perf_counter_enable(void *info)
546{
547 struct perf_counter *counter = info;
548 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
549 struct perf_counter_context *ctx = counter->ctx;
550 struct perf_counter *leader = counter->group_leader;
551 unsigned long flags;
552 int err;
553
554 /*
555 * If this is a per-task counter, need to check whether this
556 * counter's task is the current task on this cpu.
557 */
558 if (ctx->task && cpuctx->task_ctx != ctx)
559 return;
560
561 curr_rq_lock_irq_save(&flags);
562 spin_lock(&ctx->lock);
563
564 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
565 goto unlock;
566 counter->state = PERF_COUNTER_STATE_INACTIVE;
567
568 /*
569 * If the counter is in a group and isn't the group leader,
570 * then don't put it on unless the group is on.
571 */
572 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
573 goto unlock;
574
575 if (!group_can_go_on(counter, cpuctx, 1))
576 err = -EEXIST;
577 else
578 err = counter_sched_in(counter, cpuctx, ctx,
579 smp_processor_id());
580
581 if (err) {
582 /*
583 * If this counter can't go on and it's part of a
584 * group, then the whole group has to come off.
585 */
586 if (leader != counter)
587 group_sched_out(leader, cpuctx, ctx);
588 if (leader->hw_event.pinned)
589 leader->state = PERF_COUNTER_STATE_ERROR;
590 }
591
592 unlock:
593 spin_unlock(&ctx->lock);
594 curr_rq_unlock_irq_restore(&flags);
595}
596
597/*
598 * Enable a counter.
599 */
600static void perf_counter_enable(struct perf_counter *counter)
601{
602 struct perf_counter_context *ctx = counter->ctx;
603 struct task_struct *task = ctx->task;
604
605 if (!task) {
606 /*
607 * Enable the counter on the cpu that it's on
608 */
609 smp_call_function_single(counter->cpu, __perf_counter_enable,
610 counter, 1);
611 return;
612 }
613
614 spin_lock_irq(&ctx->lock);
615 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
616 goto out;
617
618 /*
619 * If the counter is in error state, clear that first.
620 * That way, if we see the counter in error state below, we
621 * know that it has gone back into error state, as distinct
622 * from the task having been scheduled away before the
623 * cross-call arrived.
624 */
625 if (counter->state == PERF_COUNTER_STATE_ERROR)
626 counter->state = PERF_COUNTER_STATE_OFF;
627
628 retry:
629 spin_unlock_irq(&ctx->lock);
630 task_oncpu_function_call(task, __perf_counter_enable, counter);
631
632 spin_lock_irq(&ctx->lock);
633
634 /*
635 * If the context is active and the counter is still off,
636 * we need to retry the cross-call.
637 */
638 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
639 goto retry;
640
641 /*
642 * Since we have the lock this context can't be scheduled
643 * in, so we can change the state safely.
644 */
645 if (counter->state == PERF_COUNTER_STATE_OFF)
646 counter->state = PERF_COUNTER_STATE_INACTIVE;
647 out:
648 spin_unlock_irq(&ctx->lock);
649}
650
651/*
652 * Enable a counter and all its children.
653 */
654static void perf_counter_enable_family(struct perf_counter *counter)
655{
656 struct perf_counter *child;
657
658 perf_counter_enable(counter);
659
660 /*
661 * Lock the mutex to protect the list of children
662 */
663 mutex_lock(&counter->mutex);
664 list_for_each_entry(child, &counter->child_list, child_list)
665 perf_counter_enable(child);
666 mutex_unlock(&counter->mutex);
667}
668
669void __perf_counter_sched_out(struct perf_counter_context *ctx,
670 struct perf_cpu_context *cpuctx)
671{
672 struct perf_counter *counter;
673 u64 flags;
674
675 spin_lock(&ctx->lock);
676 ctx->is_active = 0;
677 if (likely(!ctx->nr_counters))
678 goto out;
679
680 flags = hw_perf_save_disable();
681 if (ctx->nr_active) {
682 list_for_each_entry(counter, &ctx->counter_list, list_entry)
683 group_sched_out(counter, cpuctx, ctx);
684 }
685 hw_perf_restore(flags);
686 out:
687 spin_unlock(&ctx->lock);
688}
689
690/*
691 * Called from scheduler to remove the counters of the current task,
692 * with interrupts disabled.
693 *
694 * We stop each counter and update the counter value in counter->count.
695 *
696 * This does not protect us against NMI, but disable()
697 * sets the disabled bit in the control field of counter _before_
698 * accessing the counter control register. If a NMI hits, then it will
699 * not restart the counter.
700 */
701void perf_counter_task_sched_out(struct task_struct *task, int cpu)
702{
703 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
704 struct perf_counter_context *ctx = &task->perf_counter_ctx;
705
706 if (likely(!cpuctx->task_ctx))
707 return;
708
709 __perf_counter_sched_out(ctx, cpuctx);
710
711 cpuctx->task_ctx = NULL;
712}
713
714static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
715{
716 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
717}
718
719static int
720group_sched_in(struct perf_counter *group_counter,
721 struct perf_cpu_context *cpuctx,
722 struct perf_counter_context *ctx,
723 int cpu)
724{
725 struct perf_counter *counter, *partial_group;
726 int ret;
727
728 if (group_counter->state == PERF_COUNTER_STATE_OFF)
729 return 0;
730
731 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
732 if (ret)
733 return ret < 0 ? ret : 0;
734
735 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
736 return -EAGAIN;
737
738 /*
739 * Schedule in siblings as one group (if any):
740 */
741 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
742 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
743 partial_group = counter;
744 goto group_error;
745 }
746 }
747
748 return 0;
749
750group_error:
751 /*
752 * Groups can be scheduled in as one unit only, so undo any
753 * partial group before returning:
754 */
755 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
756 if (counter == partial_group)
757 break;
758 counter_sched_out(counter, cpuctx, ctx);
759 }
760 counter_sched_out(group_counter, cpuctx, ctx);
761
762 return -EAGAIN;
763}
764
765static void
766__perf_counter_sched_in(struct perf_counter_context *ctx,
767 struct perf_cpu_context *cpuctx, int cpu)
768{
769 struct perf_counter *counter;
770 u64 flags;
771 int can_add_hw = 1;
772
773 spin_lock(&ctx->lock);
774 ctx->is_active = 1;
775 if (likely(!ctx->nr_counters))
776 goto out;
777
778 flags = hw_perf_save_disable();
779
780 /*
781 * First go through the list and put on any pinned groups
782 * in order to give them the best chance of going on.
783 */
784 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
785 if (counter->state <= PERF_COUNTER_STATE_OFF ||
786 !counter->hw_event.pinned)
787 continue;
788 if (counter->cpu != -1 && counter->cpu != cpu)
789 continue;
790
791 if (group_can_go_on(counter, cpuctx, 1))
792 group_sched_in(counter, cpuctx, ctx, cpu);
793
794 /*
795 * If this pinned group hasn't been scheduled,
796 * put it in error state.
797 */
798 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
799 counter->state = PERF_COUNTER_STATE_ERROR;
800 }
801
802 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
803 /*
804 * Ignore counters in OFF or ERROR state, and
805 * ignore pinned counters since we did them already.
806 */
807 if (counter->state <= PERF_COUNTER_STATE_OFF ||
808 counter->hw_event.pinned)
809 continue;
810
811 /*
812 * Listen to the 'cpu' scheduling filter constraint
813 * of counters:
814 */
815 if (counter->cpu != -1 && counter->cpu != cpu)
816 continue;
817
818 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
819 if (group_sched_in(counter, cpuctx, ctx, cpu))
820 can_add_hw = 0;
821 }
822 }
823 hw_perf_restore(flags);
824 out:
825 spin_unlock(&ctx->lock);
826}
827
828/*
829 * Called from scheduler to add the counters of the current task
830 * with interrupts disabled.
831 *
832 * We restore the counter value and then enable it.
833 *
834 * This does not protect us against NMI, but enable()
835 * sets the enabled bit in the control field of counter _before_
836 * accessing the counter control register. If a NMI hits, then it will
837 * keep the counter running.
838 */
839void perf_counter_task_sched_in(struct task_struct *task, int cpu)
840{
841 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
842 struct perf_counter_context *ctx = &task->perf_counter_ctx;
843
844 __perf_counter_sched_in(ctx, cpuctx, cpu);
845 cpuctx->task_ctx = ctx;
846}
847
848static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
849{
850 struct perf_counter_context *ctx = &cpuctx->ctx;
851
852 __perf_counter_sched_in(ctx, cpuctx, cpu);
853}
854
855int perf_counter_task_disable(void)
856{
857 struct task_struct *curr = current;
858 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
859 struct perf_counter *counter;
860 unsigned long flags;
861 u64 perf_flags;
862 int cpu;
863
864 if (likely(!ctx->nr_counters))
865 return 0;
866
867 curr_rq_lock_irq_save(&flags);
868 cpu = smp_processor_id();
869
870 /* force the update of the task clock: */
871 __task_delta_exec(curr, 1);
872
873 perf_counter_task_sched_out(curr, cpu);
874
875 spin_lock(&ctx->lock);
876
877 /*
878 * Disable all the counters:
879 */
880 perf_flags = hw_perf_save_disable();
881
882 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
883 if (counter->state != PERF_COUNTER_STATE_ERROR)
884 counter->state = PERF_COUNTER_STATE_OFF;
885 }
886
887 hw_perf_restore(perf_flags);
888
889 spin_unlock(&ctx->lock);
890
891 curr_rq_unlock_irq_restore(&flags);
892
893 return 0;
894}
895
896int perf_counter_task_enable(void)
897{
898 struct task_struct *curr = current;
899 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
900 struct perf_counter *counter;
901 unsigned long flags;
902 u64 perf_flags;
903 int cpu;
904
905 if (likely(!ctx->nr_counters))
906 return 0;
907
908 curr_rq_lock_irq_save(&flags);
909 cpu = smp_processor_id();
910
911 /* force the update of the task clock: */
912 __task_delta_exec(curr, 1);
913
914 perf_counter_task_sched_out(curr, cpu);
915
916 spin_lock(&ctx->lock);
917
918 /*
919 * Disable all the counters:
920 */
921 perf_flags = hw_perf_save_disable();
922
923 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
924 if (counter->state > PERF_COUNTER_STATE_OFF)
925 continue;
926 counter->state = PERF_COUNTER_STATE_INACTIVE;
927 counter->hw_event.disabled = 0;
928 }
929 hw_perf_restore(perf_flags);
930
931 spin_unlock(&ctx->lock);
932
933 perf_counter_task_sched_in(curr, cpu);
934
935 curr_rq_unlock_irq_restore(&flags);
936
937 return 0;
938}
939
940/*
941 * Round-robin a context's counters:
942 */
943static void rotate_ctx(struct perf_counter_context *ctx)
944{
945 struct perf_counter *counter;
946 u64 perf_flags;
947
948 if (!ctx->nr_counters)
949 return;
950
951 spin_lock(&ctx->lock);
952 /*
953 * Rotate the first entry last (works just fine for group counters too):
954 */
955 perf_flags = hw_perf_save_disable();
956 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
957 list_del(&counter->list_entry);
958 list_add_tail(&counter->list_entry, &ctx->counter_list);
959 break;
960 }
961 hw_perf_restore(perf_flags);
962
963 spin_unlock(&ctx->lock);
964}
965
966void perf_counter_task_tick(struct task_struct *curr, int cpu)
967{
968 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
969 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
970 const int rotate_percpu = 0;
971
972 if (rotate_percpu)
973 perf_counter_cpu_sched_out(cpuctx);
974 perf_counter_task_sched_out(curr, cpu);
975
976 if (rotate_percpu)
977 rotate_ctx(&cpuctx->ctx);
978 rotate_ctx(ctx);
979
980 if (rotate_percpu)
981 perf_counter_cpu_sched_in(cpuctx, cpu);
982 perf_counter_task_sched_in(curr, cpu);
983}
984
985/*
986 * Cross CPU call to read the hardware counter
987 */
988static void __read(void *info)
989{
990 struct perf_counter *counter = info;
991 unsigned long flags;
992
993 curr_rq_lock_irq_save(&flags);
994 counter->hw_ops->read(counter);
995 curr_rq_unlock_irq_restore(&flags);
996}
997
998static u64 perf_counter_read(struct perf_counter *counter)
999{
1000 /*
1001 * If counter is enabled and currently active on a CPU, update the
1002 * value in the counter structure:
1003 */
1004 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1005 smp_call_function_single(counter->oncpu,
1006 __read, counter, 1);
1007 }
1008
1009 return atomic64_read(&counter->count);
1010}
1011
1012/*
1013 * Cross CPU call to switch performance data pointers
1014 */
1015static void __perf_switch_irq_data(void *info)
1016{
1017 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1018 struct perf_counter *counter = info;
1019 struct perf_counter_context *ctx = counter->ctx;
1020 struct perf_data *oldirqdata = counter->irqdata;
1021
1022 /*
1023 * If this is a task context, we need to check whether it is
1024 * the current task context of this cpu. If not it has been
1025 * scheduled out before the smp call arrived.
1026 */
1027 if (ctx->task) {
1028 if (cpuctx->task_ctx != ctx)
1029 return;
1030 spin_lock(&ctx->lock);
1031 }
1032
1033 /* Change the pointer NMI safe */
1034 atomic_long_set((atomic_long_t *)&counter->irqdata,
1035 (unsigned long) counter->usrdata);
1036 counter->usrdata = oldirqdata;
1037
1038 if (ctx->task)
1039 spin_unlock(&ctx->lock);
1040}
1041
1042static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1043{
1044 struct perf_counter_context *ctx = counter->ctx;
1045 struct perf_data *oldirqdata = counter->irqdata;
1046 struct task_struct *task = ctx->task;
1047
1048 if (!task) {
1049 smp_call_function_single(counter->cpu,
1050 __perf_switch_irq_data,
1051 counter, 1);
1052 return counter->usrdata;
1053 }
1054
1055retry:
1056 spin_lock_irq(&ctx->lock);
1057 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1058 counter->irqdata = counter->usrdata;
1059 counter->usrdata = oldirqdata;
1060 spin_unlock_irq(&ctx->lock);
1061 return oldirqdata;
1062 }
1063 spin_unlock_irq(&ctx->lock);
1064 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1065 /* Might have failed because the task was scheduled out */
1066 if (counter->irqdata == oldirqdata)
1067 goto retry;
1068
1069 return counter->usrdata;
1070}
1071
1072static void put_context(struct perf_counter_context *ctx)
1073{
1074 if (ctx->task)
1075 put_task_struct(ctx->task);
1076}
1077
1078static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1079{
1080 struct perf_cpu_context *cpuctx;
1081 struct perf_counter_context *ctx;
1082 struct task_struct *task;
1083
1084 /*
1085 * If cpu is not a wildcard then this is a percpu counter:
1086 */
1087 if (cpu != -1) {
1088 /* Must be root to operate on a CPU counter: */
1089 if (!capable(CAP_SYS_ADMIN))
1090 return ERR_PTR(-EACCES);
1091
1092 if (cpu < 0 || cpu > num_possible_cpus())
1093 return ERR_PTR(-EINVAL);
1094
1095 /*
1096 * We could be clever and allow to attach a counter to an
1097 * offline CPU and activate it when the CPU comes up, but
1098 * that's for later.
1099 */
1100 if (!cpu_isset(cpu, cpu_online_map))
1101 return ERR_PTR(-ENODEV);
1102
1103 cpuctx = &per_cpu(perf_cpu_context, cpu);
1104 ctx = &cpuctx->ctx;
1105
1106 return ctx;
1107 }
1108
1109 rcu_read_lock();
1110 if (!pid)
1111 task = current;
1112 else
1113 task = find_task_by_vpid(pid);
1114 if (task)
1115 get_task_struct(task);
1116 rcu_read_unlock();
1117
1118 if (!task)
1119 return ERR_PTR(-ESRCH);
1120
1121 ctx = &task->perf_counter_ctx;
1122 ctx->task = task;
1123
1124 /* Reuse ptrace permission checks for now. */
1125 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1126 put_context(ctx);
1127 return ERR_PTR(-EACCES);
1128 }
1129
1130 return ctx;
1131}
1132
1133/*
1134 * Called when the last reference to the file is gone.
1135 */
1136static int perf_release(struct inode *inode, struct file *file)
1137{
1138 struct perf_counter *counter = file->private_data;
1139 struct perf_counter_context *ctx = counter->ctx;
1140
1141 file->private_data = NULL;
1142
1143 mutex_lock(&ctx->mutex);
1144 mutex_lock(&counter->mutex);
1145
1146 perf_counter_remove_from_context(counter);
1147 put_context(ctx);
1148
1149 mutex_unlock(&counter->mutex);
1150 mutex_unlock(&ctx->mutex);
1151
1152 kfree(counter);
1153
1154 return 0;
1155}
1156
1157/*
1158 * Read the performance counter - simple non-blocking version for now
1159 */
1160static ssize_t
1161perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1162{
1163 u64 cntval;
1164
1165 if (count != sizeof(cntval))
1166 return -EINVAL;
1167
1168 /*
1169 * Return end-of-file for a read on a counter that is in
1170 * error state (i.e. because it was pinned but it couldn't be
1171 * scheduled on to the CPU at some point).
1172 */
1173 if (counter->state == PERF_COUNTER_STATE_ERROR)
1174 return 0;
1175
1176 mutex_lock(&counter->mutex);
1177 cntval = perf_counter_read(counter);
1178 mutex_unlock(&counter->mutex);
1179
1180 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1181}
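For the default PERF_RECORD_SIMPLE counters this is the whole read-side ABI: user space must ask for exactly sizeof(u64) bytes, and a return of 0 means the counter went into error state (for example a pinned counter that could not be scheduled). A small sketch under those assumptions:

    #include <stdint.h>
    #include <unistd.h>

    /* Returns 0 and fills *value on success, -1 on short read or error/EOF. */
    static int read_counter(int fd, uint64_t *value)
    {
            ssize_t n = read(fd, value, sizeof(*value));

            if (n == 0)
                    return -1;      /* counter is in PERF_COUNTER_STATE_ERROR */
            return n == (ssize_t)sizeof(*value) ? 0 : -1;
    }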
1182
1183static ssize_t
1184perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1185{
1186 if (!usrdata->len)
1187 return 0;
1188
1189 count = min(count, (size_t)usrdata->len);
1190 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1191 return -EFAULT;
1192
1193 /* Adjust the counters */
1194 usrdata->len -= count;
1195 if (!usrdata->len)
1196 usrdata->rd_idx = 0;
1197 else
1198 usrdata->rd_idx += count;
1199
1200 return count;
1201}
1202
1203static ssize_t
1204perf_read_irq_data(struct perf_counter *counter,
1205 char __user *buf,
1206 size_t count,
1207 int nonblocking)
1208{
1209 struct perf_data *irqdata, *usrdata;
1210 DECLARE_WAITQUEUE(wait, current);
1211 ssize_t res, res2;
1212
1213 irqdata = counter->irqdata;
1214 usrdata = counter->usrdata;
1215
1216 if (usrdata->len + irqdata->len >= count)
1217 goto read_pending;
1218
1219 if (nonblocking)
1220 return -EAGAIN;
1221
1222 spin_lock_irq(&counter->waitq.lock);
1223 __add_wait_queue(&counter->waitq, &wait);
1224 for (;;) {
1225 set_current_state(TASK_INTERRUPTIBLE);
1226 if (usrdata->len + irqdata->len >= count)
1227 break;
1228
1229 if (signal_pending(current))
1230 break;
1231
1232 if (counter->state == PERF_COUNTER_STATE_ERROR)
1233 break;
1234
1235 spin_unlock_irq(&counter->waitq.lock);
1236 schedule();
1237 spin_lock_irq(&counter->waitq.lock);
1238 }
1239 __remove_wait_queue(&counter->waitq, &wait);
1240 __set_current_state(TASK_RUNNING);
1241 spin_unlock_irq(&counter->waitq.lock);
1242
1243 if (usrdata->len + irqdata->len < count &&
1244 counter->state != PERF_COUNTER_STATE_ERROR)
1245 return -ERESTARTSYS;
1246read_pending:
1247 mutex_lock(&counter->mutex);
1248
1249 /* Drain pending data first: */
1250 res = perf_copy_usrdata(usrdata, buf, count);
1251 if (res < 0 || res == count)
1252 goto out;
1253
1254 /* Switch irq buffer: */
1255 usrdata = perf_switch_irq_data(counter);
1256 res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1257 if (res2 < 0) {
1258 if (!res)
1259 res = -EFAULT;
1260 } else {
1261 res += res2;
1262 }
1263out:
1264 mutex_unlock(&counter->mutex);
1265
1266 return res;
1267}
1268
1269static ssize_t
1270perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1271{
1272 struct perf_counter *counter = file->private_data;
1273
1274 switch (counter->hw_event.record_type) {
1275 case PERF_RECORD_SIMPLE:
1276 return perf_read_hw(counter, buf, count);
1277
1278 case PERF_RECORD_IRQ:
1279 case PERF_RECORD_GROUP:
1280 return perf_read_irq_data(counter, buf, count,
1281 file->f_flags & O_NONBLOCK);
1282 }
1283 return -EINVAL;
1284}
1285
1286static unsigned int perf_poll(struct file *file, poll_table *wait)
1287{
1288 struct perf_counter *counter = file->private_data;
1289 unsigned int events = 0;
1290 unsigned long flags;
1291
1292 poll_wait(file, &counter->waitq, wait);
1293
1294 spin_lock_irqsave(&counter->waitq.lock, flags);
1295 if (counter->usrdata->len || counter->irqdata->len)
1296 events |= POLLIN;
1297 spin_unlock_irqrestore(&counter->waitq.lock, flags);
1298
1299 return events;
1300}
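Counters opened with record_type PERF_RECORD_IRQ or PERF_RECORD_GROUP buffer their samples in the irqdata/usrdata pair, so user space can block in poll() until data is pending and then drain it with read(). A hedged sketch of that loop:

    #include <poll.h>
    #include <unistd.h>

    /* Block until the counter has sampled data, then read up to len bytes of it. */
    static ssize_t wait_for_samples(int fd, void *buf, size_t len)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            if (poll(&pfd, 1, -1) <= 0)
                    return -1;
            return read(fd, buf, len);
    }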
1301
1302static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1303{
1304 struct perf_counter *counter = file->private_data;
1305 int err = 0;
1306
1307 switch (cmd) {
1308 case PERF_COUNTER_IOC_ENABLE:
1309 perf_counter_enable_family(counter);
1310 break;
1311 case PERF_COUNTER_IOC_DISABLE:
1312 perf_counter_disable_family(counter);
1313 break;
1314 default:
1315 err = -ENOTTY;
1316 }
1317 return err;
1318}
1319
1320static const struct file_operations perf_fops = {
1321 .release = perf_release,
1322 .read = perf_read,
1323 .poll = perf_poll,
1324 .unlocked_ioctl = perf_ioctl,
1325 .compat_ioctl = perf_ioctl,
1326};
1327
1328static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1329{
1330 int cpu = raw_smp_processor_id();
1331
1332 atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
1333 return 0;
1334}
1335
1336static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1337{
1338 int cpu = raw_smp_processor_id();
1339 s64 prev;
1340 u64 now;
1341
1342 now = cpu_clock(cpu);
1343 prev = atomic64_read(&counter->hw.prev_count);
1344 atomic64_set(&counter->hw.prev_count, now);
1345 atomic64_add(now - prev, &counter->count);
1346}
1347
1348static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1349{
1350 cpu_clock_perf_counter_update(counter);
1351}
1352
1353static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1354{
1355 cpu_clock_perf_counter_update(counter);
1356}
1357
1358static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1359 .enable = cpu_clock_perf_counter_enable,
1360 .disable = cpu_clock_perf_counter_disable,
1361 .read = cpu_clock_perf_counter_read,
1362};
1363
1364/*
1365 * Called from within the scheduler:
1366 */
1367static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1368{
1369 struct task_struct *curr = counter->task;
1370 u64 delta;
1371
1372 delta = __task_delta_exec(curr, update);
1373
1374 return curr->se.sum_exec_runtime + delta;
1375}
1376
1377static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1378{
1379 u64 prev;
1380 s64 delta;
1381
1382 prev = atomic64_read(&counter->hw.prev_count);
1383
1384 atomic64_set(&counter->hw.prev_count, now);
1385
1386 delta = now - prev;
1387
1388 atomic64_add(delta, &counter->count);
1389}
1390
1391static void task_clock_perf_counter_read(struct perf_counter *counter)
1392{
1393 u64 now = task_clock_perf_counter_val(counter, 1);
1394
1395 task_clock_perf_counter_update(counter, now);
1396}
1397
1398static int task_clock_perf_counter_enable(struct perf_counter *counter)
1399{
1400 u64 now = task_clock_perf_counter_val(counter, 0);
1401
1402 atomic64_set(&counter->hw.prev_count, now);
1403
1404 return 0;
1405}
1406
1407static void task_clock_perf_counter_disable(struct perf_counter *counter)
1408{
1409 u64 now = task_clock_perf_counter_val(counter, 0);
1410
1411 task_clock_perf_counter_update(counter, now);
1412}
1413
1414static const struct hw_perf_counter_ops perf_ops_task_clock = {
1415 .enable = task_clock_perf_counter_enable,
1416 .disable = task_clock_perf_counter_disable,
1417 .read = task_clock_perf_counter_read,
1418};
1419
1420static u64 get_page_faults(void)
1421{
1422 struct task_struct *curr = current;
1423
1424 return curr->maj_flt + curr->min_flt;
1425}
1426
1427static void page_faults_perf_counter_update(struct perf_counter *counter)
1428{
1429 u64 prev, now;
1430 s64 delta;
1431
1432 prev = atomic64_read(&counter->hw.prev_count);
1433 now = get_page_faults();
1434
1435 atomic64_set(&counter->hw.prev_count, now);
1436
1437 delta = now - prev;
1438
1439 atomic64_add(delta, &counter->count);
1440}
1441
1442static void page_faults_perf_counter_read(struct perf_counter *counter)
1443{
1444 page_faults_perf_counter_update(counter);
1445}
1446
1447static int page_faults_perf_counter_enable(struct perf_counter *counter)
1448{
1449 /*
1450 * page-faults is a per-task value already,
1451 * so we don't have to clear it on switch-in.
1452 */
1453
1454 return 0;
1455}
1456
1457static void page_faults_perf_counter_disable(struct perf_counter *counter)
1458{
1459 page_faults_perf_counter_update(counter);
1460}
1461
1462static const struct hw_perf_counter_ops perf_ops_page_faults = {
1463 .enable = page_faults_perf_counter_enable,
1464 .disable = page_faults_perf_counter_disable,
1465 .read = page_faults_perf_counter_read,
1466};
1467
1468static u64 get_context_switches(void)
1469{
1470 struct task_struct *curr = current;
1471
1472 return curr->nvcsw + curr->nivcsw;
1473}
1474
1475static void context_switches_perf_counter_update(struct perf_counter *counter)
1476{
1477 u64 prev, now;
1478 s64 delta;
1479
1480 prev = atomic64_read(&counter->hw.prev_count);
1481 now = get_context_switches();
1482
1483 atomic64_set(&counter->hw.prev_count, now);
1484
1485 delta = now - prev;
1486
1487 atomic64_add(delta, &counter->count);
1488}
1489
1490static void context_switches_perf_counter_read(struct perf_counter *counter)
1491{
1492 context_switches_perf_counter_update(counter);
1493}
1494
1495static int context_switches_perf_counter_enable(struct perf_counter *counter)
1496{
1497 /*
1498 * curr->nvcsw + curr->nivcsw is a per-task value already,
1499 * so we don't have to clear it on switch-in.
1500 */
1501
1502 return 0;
1503}
1504
1505static void context_switches_perf_counter_disable(struct perf_counter *counter)
1506{
1507 context_switches_perf_counter_update(counter);
1508}
1509
1510static const struct hw_perf_counter_ops perf_ops_context_switches = {
1511 .enable = context_switches_perf_counter_enable,
1512 .disable = context_switches_perf_counter_disable,
1513 .read = context_switches_perf_counter_read,
1514};
1515
1516static inline u64 get_cpu_migrations(void)
1517{
1518 return current->se.nr_migrations;
1519}
1520
1521static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1522{
1523 u64 prev, now;
1524 s64 delta;
1525
1526 prev = atomic64_read(&counter->hw.prev_count);
1527 now = get_cpu_migrations();
1528
1529 atomic64_set(&counter->hw.prev_count, now);
1530
1531 delta = now - prev;
1532
1533 atomic64_add(delta, &counter->count);
1534}
1535
1536static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1537{
1538 cpu_migrations_perf_counter_update(counter);
1539}
1540
1541static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1542{
1543 /*
1544 * se.nr_migrations is a per-task value already,
1545	 * so we don't have to clear it on switch-in.
1546 */
1547
1548 return 0;
1549}
1550
1551static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1552{
1553 cpu_migrations_perf_counter_update(counter);
1554}
1555
1556static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1557 .enable = cpu_migrations_perf_counter_enable,
1558 .disable = cpu_migrations_perf_counter_disable,
1559 .read = cpu_migrations_perf_counter_read,
1560};
1561
1562static const struct hw_perf_counter_ops *
1563sw_perf_counter_init(struct perf_counter *counter)
1564{
1565 const struct hw_perf_counter_ops *hw_ops = NULL;
1566
1567 switch (counter->hw_event.type) {
1568 case PERF_COUNT_CPU_CLOCK:
1569 hw_ops = &perf_ops_cpu_clock;
1570 break;
1571 case PERF_COUNT_TASK_CLOCK:
1572 hw_ops = &perf_ops_task_clock;
1573 break;
1574 case PERF_COUNT_PAGE_FAULTS:
1575 hw_ops = &perf_ops_page_faults;
1576 break;
1577 case PERF_COUNT_CONTEXT_SWITCHES:
1578 hw_ops = &perf_ops_context_switches;
1579 break;
1580 case PERF_COUNT_CPU_MIGRATIONS:
1581 hw_ops = &perf_ops_cpu_migrations;
1582 break;
1583 default:
1584 break;
1585 }
1586 return hw_ops;
1587}
1588
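
Each software counter above follows the same shape: a helper that reads the current per-task value, a prev_count/delta update step, and an enable/disable/read triple collected in a struct hw_perf_counter_ops that sw_perf_counter_init() selects by hw_event type. As a rough sketch of that extension point (not part of this patch; PERF_COUNT_EXAMPLE and get_example_count() are invented names used only for illustration), one more software counter would be wired up like this, plus a matching case arm in the switch above:

/* Hypothetical sketch only: PERF_COUNT_EXAMPLE and get_example_count()
 * do not exist in this patch. */
static u64 get_example_count(void)
{
	return current->nvcsw;		/* any per-task 64-bit value */
}

static void example_perf_counter_update(struct perf_counter *counter)
{
	u64 prev = atomic64_read(&counter->hw.prev_count);
	u64 now = get_example_count();

	atomic64_set(&counter->hw.prev_count, now);
	atomic64_add((s64)(now - prev), &counter->count);
}

static int example_perf_counter_enable(struct perf_counter *counter)
{
	atomic64_set(&counter->hw.prev_count, get_example_count());
	return 0;
}

static void example_perf_counter_disable(struct perf_counter *counter)
{
	example_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_example = {
	.enable		= example_perf_counter_enable,
	.disable	= example_perf_counter_disable,
	.read		= example_perf_counter_update,
};
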
1589/*
1590 * Allocate and initialize a counter structure
1591 */
1592static struct perf_counter *
1593perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1594 int cpu,
1595 struct perf_counter *group_leader,
1596 gfp_t gfpflags)
1597{
1598 const struct hw_perf_counter_ops *hw_ops;
1599 struct perf_counter *counter;
1600
1601 counter = kzalloc(sizeof(*counter), gfpflags);
1602 if (!counter)
1603 return NULL;
1604
1605 /*
1606 * Single counters are their own group leaders, with an
1607 * empty sibling list:
1608 */
1609 if (!group_leader)
1610 group_leader = counter;
1611
1612 mutex_init(&counter->mutex);
1613 INIT_LIST_HEAD(&counter->list_entry);
1614 INIT_LIST_HEAD(&counter->sibling_list);
1615 init_waitqueue_head(&counter->waitq);
1616
1617 INIT_LIST_HEAD(&counter->child_list);
1618
1619 counter->irqdata = &counter->data[0];
1620 counter->usrdata = &counter->data[1];
1621 counter->cpu = cpu;
1622 counter->hw_event = *hw_event;
1623 counter->wakeup_pending = 0;
1624 counter->group_leader = group_leader;
1625 counter->hw_ops = NULL;
1626
1627 counter->state = PERF_COUNTER_STATE_INACTIVE;
1628 if (hw_event->disabled)
1629 counter->state = PERF_COUNTER_STATE_OFF;
1630
1631 hw_ops = NULL;
1632 if (!hw_event->raw && hw_event->type < 0)
1633 hw_ops = sw_perf_counter_init(counter);
1634 if (!hw_ops)
1635 hw_ops = hw_perf_counter_init(counter);
1636
1637 if (!hw_ops) {
1638 kfree(counter);
1639 return NULL;
1640 }
1641 counter->hw_ops = hw_ops;
1642
1643 return counter;
1644}
1645
1646/**
1647 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1648 *
1649 * @hw_event_uptr: event type attributes for monitoring/sampling
1650 * @pid: target pid
1651 * @cpu: target cpu
1652 * @group_fd: group leader counter fd
1653 */
1654asmlinkage int
1655sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1656 pid_t pid, int cpu, int group_fd)
1657{
1658 struct perf_counter *counter, *group_leader;
1659 struct perf_counter_hw_event hw_event;
1660 struct perf_counter_context *ctx;
1661 struct file *counter_file = NULL;
1662 struct file *group_file = NULL;
1663 int fput_needed = 0;
1664 int fput_needed2 = 0;
1665 int ret;
1666
1667 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1668 return -EFAULT;
1669
1670 /*
1671 * Get the target context (task or percpu):
1672 */
1673 ctx = find_get_context(pid, cpu);
1674 if (IS_ERR(ctx))
1675 return PTR_ERR(ctx);
1676
1677 /*
1678 * Look up the group leader (we will attach this counter to it):
1679 */
1680 group_leader = NULL;
1681 if (group_fd != -1) {
1682 ret = -EINVAL;
1683 group_file = fget_light(group_fd, &fput_needed);
1684 if (!group_file)
1685 goto err_put_context;
1686 if (group_file->f_op != &perf_fops)
1687 goto err_put_context;
1688
1689 group_leader = group_file->private_data;
1690 /*
1691 * Do not allow a recursive hierarchy (this new sibling
1692 * becoming part of another group-sibling):
1693 */
1694 if (group_leader->group_leader != group_leader)
1695 goto err_put_context;
1696 /*
1697		 * Do not allow attaching to a group in a different
1698 * task or CPU context:
1699 */
1700 if (group_leader->ctx != ctx)
1701 goto err_put_context;
1702 /*
1703 * Only a group leader can be exclusive or pinned
1704 */
1705 if (hw_event.exclusive || hw_event.pinned)
1706 goto err_put_context;
1707 }
1708
1709 ret = -EINVAL;
1710 counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1711 if (!counter)
1712 goto err_put_context;
1713
1714 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1715 if (ret < 0)
1716 goto err_free_put_context;
1717
1718 counter_file = fget_light(ret, &fput_needed2);
1719 if (!counter_file)
1720 goto err_free_put_context;
1721
1722 counter->filp = counter_file;
1723 mutex_lock(&ctx->mutex);
1724 perf_install_in_context(ctx, counter, cpu);
1725 mutex_unlock(&ctx->mutex);
1726
1727 fput_light(counter_file, fput_needed2);
1728
1729out_fput:
1730 fput_light(group_file, fput_needed);
1731
1732 return ret;
1733
1734err_free_put_context:
1735 kfree(counter);
1736
1737err_put_context:
1738 put_context(ctx);
1739
1740 goto out_fput;
1741}
1742
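
Seen from userspace, the flow above reduces to filling in a struct perf_counter_hw_event and calling the new syscall; the returned file descriptor is then read for the current counter value. The sketch below is illustrative only: it assumes __NR_perf_counter_open and the structure/enum definitions from the patch's headers are visible to userspace, that pid 0 means the calling task and cpu -1 means "any CPU" (as described in Documentation/perf-counters.txt), and that read() returns the counter value as a u64.

/* Illustrative userspace sketch, not part of this patch. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>	/* assumed to be visible to userspace */

static int perf_counter_open(struct perf_counter_hw_event *hw_event,
			     pid_t pid, int cpu, int group_fd)
{
	return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
}

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t value;
	int leader, sibling;

	/* group leader: task clock of the calling task, on any CPU */
	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = PERF_COUNT_TASK_CLOCK;
	leader = perf_counter_open(&hw_event, 0, -1, -1);
	if (leader < 0)
		return 1;

	/* sibling counter in the same group */
	hw_event.type = PERF_COUNT_CONTEXT_SWITCHES;
	sibling = perf_counter_open(&hw_event, 0, -1, leader);

	/* ... run the workload to be measured ... */

	if (read(leader, &value, sizeof(value)) == sizeof(value))
		printf("task clock:       %llu\n", (unsigned long long)value);
	if (sibling >= 0 && read(sibling, &value, sizeof(value)) == sizeof(value))
		printf("context switches: %llu\n", (unsigned long long)value);
	return 0;
}

Counters opened with hw_event.disabled set start out in the OFF state, as perf_counter_alloc() above shows.
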
1743/*
1744 * Initialize the perf_counter context in a task_struct:
1745 */
1746static void
1747__perf_counter_init_context(struct perf_counter_context *ctx,
1748 struct task_struct *task)
1749{
1750 memset(ctx, 0, sizeof(*ctx));
1751 spin_lock_init(&ctx->lock);
1752 mutex_init(&ctx->mutex);
1753 INIT_LIST_HEAD(&ctx->counter_list);
1754 ctx->task = task;
1755}
1756
1757/*
1758 * inherit a counter from parent task to child task:
1759 */
1760static struct perf_counter *
1761inherit_counter(struct perf_counter *parent_counter,
1762 struct task_struct *parent,
1763 struct perf_counter_context *parent_ctx,
1764 struct task_struct *child,
1765 struct perf_counter *group_leader,
1766 struct perf_counter_context *child_ctx)
1767{
1768 struct perf_counter *child_counter;
1769
1770 /*
1771 * Instead of creating recursive hierarchies of counters,
1772 * we link inherited counters back to the original parent,
1773	 * which is guaranteed to have a filp; we use that filp as the
1774	 * reference count:
1775 */
1776 if (parent_counter->parent)
1777 parent_counter = parent_counter->parent;
1778
1779 child_counter = perf_counter_alloc(&parent_counter->hw_event,
1780 parent_counter->cpu, group_leader,
1781 GFP_KERNEL);
1782 if (!child_counter)
1783 return NULL;
1784
1785 /*
1786 * Link it up in the child's context:
1787 */
1788 child_counter->ctx = child_ctx;
1789 child_counter->task = child;
1790 list_add_counter(child_counter, child_ctx);
1791 child_ctx->nr_counters++;
1792
1793 child_counter->parent = parent_counter;
1794 /*
1795 * inherit into child's child as well:
1796 */
1797 child_counter->hw_event.inherit = 1;
1798
1799 /*
1800 * Get a reference to the parent filp - we will fput it
1801 * when the child counter exits. This is safe to do because
1802 * we are in the parent and we know that the filp still
1803 * exists and has a nonzero count:
1804 */
1805 atomic_long_inc(&parent_counter->filp->f_count);
1806
1807 /*
1808 * Link this into the parent counter's child list
1809 */
1810 mutex_lock(&parent_counter->mutex);
1811 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
1812
1813 /*
1814 * Make the child state follow the state of the parent counter,
1815 * not its hw_event.disabled bit. We hold the parent's mutex,
1816 * so we won't race with perf_counter_{en,dis}able_family.
1817 */
1818 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
1819 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1820 else
1821 child_counter->state = PERF_COUNTER_STATE_OFF;
1822
1823 mutex_unlock(&parent_counter->mutex);
1824
1825 return child_counter;
1826}
1827
1828static int inherit_group(struct perf_counter *parent_counter,
1829 struct task_struct *parent,
1830 struct perf_counter_context *parent_ctx,
1831 struct task_struct *child,
1832 struct perf_counter_context *child_ctx)
1833{
1834 struct perf_counter *leader;
1835 struct perf_counter *sub;
1836
1837 leader = inherit_counter(parent_counter, parent, parent_ctx,
1838 child, NULL, child_ctx);
1839 if (!leader)
1840 return -ENOMEM;
1841 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
1842 if (!inherit_counter(sub, parent, parent_ctx,
1843 child, leader, child_ctx))
1844 return -ENOMEM;
1845 }
1846 return 0;
1847}
1848
1849static void sync_child_counter(struct perf_counter *child_counter,
1850 struct perf_counter *parent_counter)
1851{
1852 u64 parent_val, child_val;
1853
1854 parent_val = atomic64_read(&parent_counter->count);
1855 child_val = atomic64_read(&child_counter->count);
1856
1857 /*
1858 * Add back the child's count to the parent's count:
1859 */
1860 atomic64_add(child_val, &parent_counter->count);
1861
1862 /*
1863 * Remove this counter from the parent's list
1864 */
1865 mutex_lock(&parent_counter->mutex);
1866 list_del_init(&child_counter->child_list);
1867 mutex_unlock(&parent_counter->mutex);
1868
1869 /*
1870 * Release the parent counter, if this was the last
1871 * reference to it.
1872 */
1873 fput(parent_counter->filp);
1874}
1875
1876static void
1877__perf_counter_exit_task(struct task_struct *child,
1878 struct perf_counter *child_counter,
1879 struct perf_counter_context *child_ctx)
1880{
1881 struct perf_counter *parent_counter;
1882 struct perf_counter *sub, *tmp;
1883
1884 /*
1885 * If we do not self-reap then we have to wait for the
1886 * child task to unschedule (it will happen for sure),
1887 * so that its counter is at its final count. (This
1888 * condition triggers rarely - child tasks usually get
1889 * off their CPU before the parent has a chance to
1890 * get this far into the reaping action)
1891 */
1892 if (child != current) {
1893 wait_task_inactive(child, 0);
1894 list_del_init(&child_counter->list_entry);
1895 } else {
1896 struct perf_cpu_context *cpuctx;
1897 unsigned long flags;
1898 u64 perf_flags;
1899
1900 /*
1901 * Disable and unlink this counter.
1902 *
1903 * Be careful about zapping the list - IRQ/NMI context
1904 * could still be processing it:
1905 */
1906 curr_rq_lock_irq_save(&flags);
1907 perf_flags = hw_perf_save_disable();
1908
1909 cpuctx = &__get_cpu_var(perf_cpu_context);
1910
1911 group_sched_out(child_counter, cpuctx, child_ctx);
1912
1913 list_del_init(&child_counter->list_entry);
1914
1915 child_ctx->nr_counters--;
1916
1917 hw_perf_restore(perf_flags);
1918 curr_rq_unlock_irq_restore(&flags);
1919 }
1920
1921 parent_counter = child_counter->parent;
1922 /*
1923 * It can happen that parent exits first, and has counters
1924 * that are still around due to the child reference. These
1925 * counters need to be zapped - but otherwise linger.
1926 */
1927 if (parent_counter) {
1928 sync_child_counter(child_counter, parent_counter);
1929 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
1930 list_entry) {
1931 if (sub->parent)
1932 sync_child_counter(sub, sub->parent);
1933 kfree(sub);
1934 }
1935 }
1936
1937 kfree(child_counter);
1938}
1939
1940/*
1941 * When a child task exits, feed back counter values to parent counters.
1942 *
1943 * Note: we may be running in child context, but the PID is not hashed
1944 * anymore so new counters will not be added.
1945 */
1946void perf_counter_exit_task(struct task_struct *child)
1947{
1948 struct perf_counter *child_counter, *tmp;
1949 struct perf_counter_context *child_ctx;
1950
1951 child_ctx = &child->perf_counter_ctx;
1952
1953 if (likely(!child_ctx->nr_counters))
1954 return;
1955
1956 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1957 list_entry)
1958 __perf_counter_exit_task(child, child_counter, child_ctx);
1959}
1960
1961/*
1962 * Initialize the perf_counter context in task_struct
1963 */
1964void perf_counter_init_task(struct task_struct *child)
1965{
1966 struct perf_counter_context *child_ctx, *parent_ctx;
1967 struct perf_counter *counter;
1968 struct task_struct *parent = current;
1969
1970 child_ctx = &child->perf_counter_ctx;
1971 parent_ctx = &parent->perf_counter_ctx;
1972
1973 __perf_counter_init_context(child_ctx, child);
1974
1975 /*
1976 * This is executed from the parent task context, so inherit
1977 * counters that have been marked for cloning:
1978 */
1979
1980 if (likely(!parent_ctx->nr_counters))
1981 return;
1982
1983 /*
1984 * Lock the parent list. No need to lock the child - not PID
1985 * hashed yet and not running, so nobody can access it.
1986 */
1987 mutex_lock(&parent_ctx->mutex);
1988
1989 /*
1990	 * We don't have to disable NMIs - we are only looking at
1991 * the list, not manipulating it:
1992 */
1993 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1994 if (!counter->hw_event.inherit)
1995 continue;
1996
1997 if (inherit_group(counter, parent,
1998 parent_ctx, child, child_ctx))
1999 break;
2000 }
2001
2002 mutex_unlock(&parent_ctx->mutex);
2003}
2004
2005static void __cpuinit perf_counter_init_cpu(int cpu)
2006{
2007 struct perf_cpu_context *cpuctx;
2008
2009 cpuctx = &per_cpu(perf_cpu_context, cpu);
2010 __perf_counter_init_context(&cpuctx->ctx, NULL);
2011
2012 mutex_lock(&perf_resource_mutex);
2013 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2014 mutex_unlock(&perf_resource_mutex);
2015
2016 hw_perf_counter_setup(cpu);
2017}
2018
2019#ifdef CONFIG_HOTPLUG_CPU
2020static void __perf_counter_exit_cpu(void *info)
2021{
2022 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2023 struct perf_counter_context *ctx = &cpuctx->ctx;
2024 struct perf_counter *counter, *tmp;
2025
2026 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2027 __perf_counter_remove_from_context(counter);
2028}
2029static void perf_counter_exit_cpu(int cpu)
2030{
2031 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2032 struct perf_counter_context *ctx = &cpuctx->ctx;
2033
2034 mutex_lock(&ctx->mutex);
2035 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2036 mutex_unlock(&ctx->mutex);
2037}
2038#else
2039static inline void perf_counter_exit_cpu(int cpu) { }
2040#endif
2041
2042static int __cpuinit
2043perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2044{
2045 unsigned int cpu = (long)hcpu;
2046
2047 switch (action) {
2048
2049 case CPU_UP_PREPARE:
2050 case CPU_UP_PREPARE_FROZEN:
2051 perf_counter_init_cpu(cpu);
2052 break;
2053
2054 case CPU_DOWN_PREPARE:
2055 case CPU_DOWN_PREPARE_FROZEN:
2056 perf_counter_exit_cpu(cpu);
2057 break;
2058
2059 default:
2060 break;
2061 }
2062
2063 return NOTIFY_OK;
2064}
2065
2066static struct notifier_block __cpuinitdata perf_cpu_nb = {
2067 .notifier_call = perf_cpu_notify,
2068};
2069
2070static int __init perf_counter_init(void)
2071{
2072 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2073 (void *)(long)smp_processor_id());
2074 register_cpu_notifier(&perf_cpu_nb);
2075
2076 return 0;
2077}
2078early_initcall(perf_counter_init);
2079
2080static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2081{
2082 return sprintf(buf, "%d\n", perf_reserved_percpu);
2083}
2084
2085static ssize_t
2086perf_set_reserve_percpu(struct sysdev_class *class,
2087 const char *buf,
2088 size_t count)
2089{
2090 struct perf_cpu_context *cpuctx;
2091 unsigned long val;
2092 int err, cpu, mpt;
2093
2094 err = strict_strtoul(buf, 10, &val);
2095 if (err)
2096 return err;
2097 if (val > perf_max_counters)
2098 return -EINVAL;
2099
2100 mutex_lock(&perf_resource_mutex);
2101 perf_reserved_percpu = val;
2102 for_each_online_cpu(cpu) {
2103 cpuctx = &per_cpu(perf_cpu_context, cpu);
2104 spin_lock_irq(&cpuctx->ctx.lock);
2105 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2106 perf_max_counters - perf_reserved_percpu);
2107 cpuctx->max_pertask = mpt;
2108 spin_unlock_irq(&cpuctx->ctx.lock);
2109 }
2110 mutex_unlock(&perf_resource_mutex);
2111
2112 return count;
2113}
2114
2115static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2116{
2117 return sprintf(buf, "%d\n", perf_overcommit);
2118}
2119
2120static ssize_t
2121perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2122{
2123 unsigned long val;
2124 int err;
2125
2126 err = strict_strtoul(buf, 10, &val);
2127 if (err)
2128 return err;
2129 if (val > 1)
2130 return -EINVAL;
2131
2132 mutex_lock(&perf_resource_mutex);
2133 perf_overcommit = val;
2134 mutex_unlock(&perf_resource_mutex);
2135
2136 return count;
2137}
2138
2139static SYSDEV_CLASS_ATTR(
2140 reserve_percpu,
2141 0644,
2142 perf_show_reserve_percpu,
2143 perf_set_reserve_percpu
2144 );
2145
2146static SYSDEV_CLASS_ATTR(
2147 overcommit,
2148 0644,
2149 perf_show_overcommit,
2150 perf_set_overcommit
2151 );
2152
2153static struct attribute *perfclass_attrs[] = {
2154 &attr_reserve_percpu.attr,
2155 &attr_overcommit.attr,
2156 NULL
2157};
2158
2159static struct attribute_group perfclass_attr_group = {
2160 .attrs = perfclass_attrs,
2161 .name = "perf_counters",
2162};
2163
2164static int __init perf_counter_sysfs_init(void)
2165{
2166 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2167 &perfclass_attr_group);
2168}
2169device_initcall(perf_counter_sysfs_init);
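
The two sysfs attributes registered above tune how hardware counters are split between per-task and per-CPU use: reserve_percpu lowers each CPU's max_pertask by the reserved amount, and overcommit is a plain 0/1 flag. With cpu_sysdev_class as the parent, the files presumably end up under /sys/devices/system/cpu/perf_counters/; that path is an assumption, as is this minimal sketch of adjusting the reservation:

/* Sketch only: the sysfs path is inferred from cpu_sysdev_class plus the
 * "perf_counters" attribute group name above. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/perf_counters/reserve_percpu";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "2\n");	/* reserve two hardware counters per CPU */
	fclose(f);
	return 0;
}
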
diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..ce9fecab5f02 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,7 +668,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -979,6 +979,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	local_irq_save(*flags);
+	rq = cpu_rq(smp_processor_id());
+	spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+	__releases(rq->lock)
+{
+	struct rq *rq;
+
+	rq = cpu_rq(smp_processor_id());
+	spin_unlock(&rq->lock);
+	local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
@@ -1885,12 +1905,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2242,6 +2264,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
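
task_oncpu_function_call(), added in the hunk above, is a small cross-CPU helper: it runs func(info) via smp_call_function_single() on whichever CPU @p is currently running on, and silently does nothing when the task is not running; the perf_counter code earlier in this patch presumably relies on it for exactly that. A hypothetical in-kernel caller, sketched only for illustration, would look like this:

/* Hypothetical sketch: remote_read() and read_counter_on_owner_cpu()
 * are invented for illustration and are not part of this patch. */
static void remote_read(void *info)
{
	struct perf_counter *counter = info;

	counter->hw_ops->read(counter);
}

static void read_counter_on_owner_cpu(struct perf_counter *counter,
				      struct task_struct *task)
{
	/*
	 * Runs remote_read() on the CPU where @task is executing;
	 * returns without doing anything if the task is not running.
	 */
	task_oncpu_function_call(task, remote_read, counter);
}
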
@@ -2384,6 +2427,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 
@@ -2604,6 +2648,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (current->sched_class->post_schedule)
@@ -4132,6 +4177,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+	s64 delta_exec;
+	struct rq *rq;
+
+	rq = task_rq(p);
+	WARN_ON_ONCE(!runqueue_is_locked());
+	WARN_ON_ONCE(!task_current(rq, p));
+
+	if (update)
+		update_rq_clock(rq);
+
+	delta_exec = rq->clock - p->se.exec_start;
+
+	WARN_ON_ONCE(delta_exec < 0);
+
+	return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
@@ -4391,6 +4459,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -4586,6 +4655,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
diff --git a/kernel/sys.c b/kernel/sys.c
index 763c3c17ded3..c2a951ae4223 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1797,6 +1798,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
+		case PR_TASK_PERF_COUNTERS_DISABLE:
+			error = perf_counter_task_disable();
+			break;
+		case PR_TASK_PERF_COUNTERS_ENABLE:
+			error = perf_counter_task_enable();
+			break;
 		case PR_GET_TIMERSLACK:
 			error = current->timer_slack_ns;
 			break;
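
From userspace, the two new prctl() options appear to act as a per-task on/off switch for the calling task's counters, via perf_counter_task_disable() and perf_counter_task_enable(). A minimal sketch, not part of this patch and assuming the PR_TASK_PERF_COUNTERS_* constants are picked up from the patched <linux/prctl.h>:

/* Sketch only: pause counting around a region that should not be measured.
 * PR_TASK_PERF_COUNTERS_DISABLE/ENABLE are added to <linux/prctl.h>
 * by this patch. */
#include <sys/prctl.h>
#include <linux/prctl.h>

void run_unmeasured(void (*fn)(void *arg), void *arg)
{
	prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	fn(arg);
	prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
}
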
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a23281707..4be8bbc7577c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);