-rw-r--r-- Documentation/perf-counters.txt | 147
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/ia32/ia32entry.S | 3
-rw-r--r-- arch/x86/include/asm/atomic_32.h | 218
-rw-r--r-- arch/x86/include/asm/hardirq_32.h | 1
-rw-r--r-- arch/x86/include/asm/hw_irq.h | 2
-rw-r--r-- arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r-- arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r-- arch/x86/include/asm/mach-default/entry_arch.h | 5
-rw-r--r-- arch/x86/include/asm/pda.h | 1
-rw-r--r-- arch/x86/include/asm/perf_counter.h | 95
-rw-r--r-- arch/x86/include/asm/thread_info.h | 4
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 1
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 3
-rw-r--r-- arch/x86/kernel/apic.c | 2
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 12
-rw-r--r-- arch/x86/kernel/cpu/common.c | 2
-rw-r--r-- arch/x86/kernel/cpu/perf_counter.c | 695
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r-- arch/x86/kernel/entry_64.S | 5
-rw-r--r-- arch/x86/kernel/irq.c | 5
-rw-r--r-- arch/x86/kernel/irqinit_32.c | 3
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 5
-rw-r--r-- arch/x86/kernel/signal.c | 7
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r-- arch/x86/oprofile/op_model_ppro.c | 2
-rw-r--r-- drivers/acpi/processor_idle.c | 8
-rw-r--r-- drivers/char/sysrq.c | 2
-rw-r--r-- fs/exec.c | 8
-rw-r--r-- include/linux/init_task.h | 11
-rw-r--r-- include/linux/kernel_stat.h | 8
-rw-r--r-- include/linux/perf_counter.h | 257
-rw-r--r-- include/linux/prctl.h | 3
-rw-r--r-- include/linux/sched.h | 12
-rw-r--r-- include/linux/syscalls.h | 8
-rw-r--r-- init/Kconfig | 30
-rw-r--r-- kernel/Makefile | 1
-rw-r--r-- kernel/exit.c | 13
-rw-r--r-- kernel/fork.c | 1
-rw-r--r-- kernel/perf_counter.c | 1686
-rw-r--r-- kernel/sched.c | 76
-rw-r--r-- kernel/sys.c | 7
-rw-r--r-- kernel/sys_ni.c | 3
43 files changed, 3342 insertions, 50 deletions
diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
new file mode 100644
index 000000000000..fddd32189a50
--- /dev/null
+++ b/Documentation/perf-counters.txt
@@ -0,0 +1,147 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
 6CPUs. These registers count certain types of hw events, such
 7as instructions executed, cache misses suffered, or branches mis-predicted -
8without slowing down the kernel or applications. These registers can also
 9trigger interrupts when a threshold number of events has been reached - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per task and per CPU counters, counter
14groups, and it provides event capabilities on top of those.
15
16Performance counters are accessed via special file descriptors.
17There's one file descriptor per virtual counter used.
18
19The special file descriptor is opened via the perf_counter_open()
20system call:
21
22 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
23 pid_t pid, int cpu, int group_fd);
24
25The syscall returns the new fd. The fd can be used via the normal
26VFS system calls: read() can be used to read the counter, fcntl()
27can be used to set the blocking mode, etc.
28
29Multiple counters can be kept open at a time, and the counters
30can be poll()ed.
31
32When creating a new counter fd, 'perf_counter_hw_event' is:
33
34/*
35 * Hardware event to monitor via a performance monitoring counter:
36 */
37struct perf_counter_hw_event {
38 s64 type;
39
40 u64 irq_period;
41 u32 record_type;
42
43 u32 disabled : 1, /* off by default */
44 nmi : 1, /* NMI sampling */
45 raw : 1, /* raw event type */
46 __reserved_1 : 29;
47
48 u64 __reserved_2;
49};
50
51/*
52 * Generalized performance counter event types, used by the hw_event.type
53 * parameter of the sys_perf_counter_open() syscall:
54 */
55enum hw_event_types {
56 /*
57 * Common hardware events, generalized by the kernel:
58 */
59 PERF_COUNT_CYCLES = 0,
60 PERF_COUNT_INSTRUCTIONS = 1,
61 PERF_COUNT_CACHE_REFERENCES = 2,
62 PERF_COUNT_CACHE_MISSES = 3,
63 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
64 PERF_COUNT_BRANCH_MISSES = 5,
65
66 /*
67 * Special "software" counters provided by the kernel, even if
68 * the hardware does not support performance counters. These
69 * counters measure various physical and sw events of the
70 * kernel (and allow the profiling of them as well):
71 */
72 PERF_COUNT_CPU_CLOCK = -1,
73 PERF_COUNT_TASK_CLOCK = -2,
74 /*
75 * Future software events:
76 */
77 /* PERF_COUNT_PAGE_FAULTS = -3,
78 PERF_COUNT_CONTEXT_SWITCHES = -4, */
79};
80
81These are standardized types of events that work uniformly on all CPUs
 82that implement Performance Counters support under Linux. If a CPU is
83not able to count branch-misses, then the system call will return
84-EINVAL.
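
As a rough, hedged illustration of how the pieces above fit together, a
minimal user-space sketch that counts instructions of the current task
could look like the code below. The syscall number is an assumption taken
from this patch (333 on i386, 295 on x86-64, as wired up later in this
commit), and the structure is re-declared locally only so the example is
self-contained; it mirrors the layout documented above. The later snippets
in this document extend this sketch and reuse its 'hw_event' and 'fd'
variables.

/* Hedged sketch, not part of the patch: count instructions of the
 * current task on whatever CPU it runs on, then read the total. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open 333	/* i386 value from this patch */
#endif

struct perf_counter_hw_event {		/* local copy of the documented layout */
	int64_t		type;
	uint64_t	irq_period;
	uint32_t	record_type;
	uint32_t	disabled     :  1,
			nmi          :  1,
			raw          :  1,
			__reserved_1 : 29;
	uint64_t	__reserved_2;
};

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = 1;		/* PERF_COUNT_INSTRUCTIONS */

	/* pid == 0: current task, cpu == -1: any CPU, group_fd == -1: no group */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
	if (fd < 0)
		return 1;

	/* ... run the workload to be measured ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}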
85
 86More hw_event_types are supported as well, but they are
 87CPU-specific and are enumerated via /sys on a per-CPU basis. Raw hw event
88types can be passed in under hw_event.type if hw_event.raw is 1.
89For example, to count "External bus cycles while bus lock signal asserted"
90events on Intel Core CPUs, pass in a 0x4064 event type value and set
91hw_event.raw to 1.
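
Continuing the sketch above (and reusing its declarations), a raw,
CPU-specific event would be requested by setting the raw bit and passing
the raw event code in the type field; the 0x4064 value is the example
given in the text, everything else is illustrative:

	/* Raw hardware event instead of a generalized one: */
	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.raw  = 1;
	hw_event.type = 0x4064;	/* bus cycles w/ bus lock asserted, Intel Core */

	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);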
92
93'record_type' is the type of data that a read() will provide for the
94counter, and it can be one of:
95
96/*
97 * IRQ-notification data record type:
98 */
99enum perf_counter_record_type {
100 PERF_RECORD_SIMPLE = 0,
101 PERF_RECORD_IRQ = 1,
102 PERF_RECORD_GROUP = 2,
103};
104
105a "simple" counter is one that counts hardware events and allows
106them to be read out into a u64 count value. (read() returns 8 on
107a successful read of a simple counter.)
108
 109An "irq" counter is one that will also provide IRQ context information:
110the IP of the interrupted context. In this case read() will return
111the 8-byte counter value, plus the Instruction Pointer address of the
112interrupted context.
113
 114The 'irq_period' parameter is the number of events before waking up
 115a read() that is blocked on a counter fd. A value of zero means a
 116non-blocking counter.
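
As a hedged sketch building on the example above, an "irq" counter with a
wakeup period could be set up as follows; read() then returns 16 bytes,
the counter value followed by the interrupted instruction pointer:

	/* Wake up a blocked read() every ~1,000,000 instructions and
	 * record the IP of the interrupted context: */
	uint64_t record[2];

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type        = 1;	/* PERF_COUNT_INSTRUCTIONS */
	hw_event.record_type = 1;	/* PERF_RECORD_IRQ */
	hw_event.irq_period  = 1000000;

	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);

	if (read(fd, record, sizeof(record)) == sizeof(record))
		printf("count %llu, ip 0x%llx\n",
		       (unsigned long long)record[0],
		       (unsigned long long)record[1]);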
117
118The 'pid' parameter allows the counter to be specific to a task:
119
120 pid == 0: if the pid parameter is zero, the counter is attached to the
121 current task.
122
123 pid > 0: the counter is attached to a specific task (if the current task
124 has sufficient privilege to do so)
125
126 pid < 0: all tasks are counted (per cpu counters)
127
128The 'cpu' parameter allows a counter to be made specific to a full
129CPU:
130
131 cpu >= 0: the counter is restricted to a specific CPU
132 cpu == -1: the counter counts on all CPUs
133
134(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
135
136A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
137events of that task and 'follows' that task to whatever CPU the task
 138gets scheduled to. Per task counters can be created by any user, for
139their own tasks.
140
141A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
142all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
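
For completeness, a per-CPU counter (again reusing the declarations from
the first sketch, and assuming the calling process has CAP_SYS_ADMIN)
would pass pid == -1 and an explicit CPU number:

	/* Count cycles for every task that runs on CPU 0: */
	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = 0;		/* PERF_COUNT_CYCLES */

	fd = syscall(__NR_perf_counter_open, &hw_event, -1 /* pid */, 0 /* cpu */, -1);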
143
144Group counters are created by passing in a group_fd of another counter.
 145Groups are scheduled in and out as a single unit and can be used with
 146PERF_RECORD_GROUP to record multi-dimensional timestamps.
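
A hedged sketch of group creation, continuing from the earlier examples:
the first counter acts as the group leader, and further counters join the
group by passing its fd as group_fd.

	/* 'fd' from an earlier sketch becomes the group leader: */
	int leader_fd = fd;
	int member_fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type        = 3;	/* PERF_COUNT_CACHE_MISSES */
	hw_event.record_type = 2;	/* PERF_RECORD_GROUP */

	member_fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, leader_fd);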
147
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8fd4d1..1f4844505765 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -685,6 +685,7 @@ config X86_UP_IOAPIC
685config X86_LOCAL_APIC 685config X86_LOCAL_APIC
686 def_bool y 686 def_bool y
687 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) 687 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
688 select HAVE_PERF_COUNTERS if (!M386 && !M486)
688 689
689config X86_IO_APIC 690config X86_IO_APIC
690 def_bool y 691 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..3c14ed07dc4e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
823 .quad compat_sys_signalfd4 823 .quad compat_sys_signalfd4
824 .quad sys_eventfd2 824 .quad sys_eventfd2
825 .quad sys_epoll_create1 825 .quad sys_epoll_create1
826 .quad sys_dup3 /* 330 */ 826 .quad sys_dup3 /* 330 */
827 .quad sys_pipe2 827 .quad sys_pipe2
828 .quad sys_inotify_init1 828 .quad sys_inotify_init1
829 .quad sys_perf_counter_open
829ia32_syscall_end: 830ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..977250ed8b89 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
 250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_set - set atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 *
298 * Atomically sets the value of @ptr to @new_val.
299 */
300static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
301{
302 unsigned long long old_val;
303
304 do {
305 old_val = atomic_read(ptr);
306 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
307}
308
309/**
310 * atomic64_read - read atomic64 variable
311 * @ptr: pointer to type atomic64_t
312 *
313 * Atomically reads the value of @ptr and returns it.
314 */
315static inline unsigned long long atomic64_read(atomic64_t *ptr)
316{
317 unsigned long long curr_val;
318
319 do {
320 curr_val = __atomic64_read(ptr);
321 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
322
323 return curr_val;
324}
325
326/**
327 * atomic64_add_return - add and return
328 * @delta: integer value to add
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically adds @delta to @ptr and returns @delta + *@ptr
332 */
333static inline unsigned long long
334atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
335{
336 unsigned long long old_val, new_val;
337
338 do {
339 old_val = atomic_read(ptr);
340 new_val = old_val + delta;
341
342 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
343
344 return new_val;
345}
346
347static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
348{
349 return atomic64_add_return(-delta, ptr);
350}
351
352static inline long atomic64_inc_return(atomic64_t *ptr)
353{
354 return atomic64_add_return(1, ptr);
355}
356
357static inline long atomic64_dec_return(atomic64_t *ptr)
358{
359 return atomic64_sub_return(1, ptr);
360}
361
362/**
363 * atomic64_add - add integer to atomic64 variable
364 * @delta: integer value to add
365 * @ptr: pointer to type atomic64_t
366 *
367 * Atomically adds @delta to @ptr.
368 */
369static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
370{
371 atomic64_add_return(delta, ptr);
372}
373
374/**
375 * atomic64_sub - subtract the atomic64 variable
376 * @delta: integer value to subtract
377 * @ptr: pointer to type atomic64_t
378 *
379 * Atomically subtracts @delta from @ptr.
380 */
381static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
382{
383 atomic64_add(-delta, ptr);
384}
385
386/**
387 * atomic64_sub_and_test - subtract value from variable and test result
388 * @delta: integer value to subtract
389 * @ptr: pointer to type atomic64_t
390 *
391 * Atomically subtracts @delta from @ptr and returns
392 * true if the result is zero, or false for all
393 * other cases.
394 */
395static inline int
396atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
397{
398 unsigned long long old_val = atomic64_sub_return(delta, ptr);
399
400 return old_val == 0;
401}
402
403/**
404 * atomic64_inc - increment atomic64 variable
405 * @ptr: pointer to type atomic64_t
406 *
407 * Atomically increments @ptr by 1.
408 */
409static inline void atomic64_inc(atomic64_t *ptr)
410{
411 atomic64_add(1, ptr);
412}
413
414/**
415 * atomic64_dec - decrement atomic64 variable
416 * @ptr: pointer to type atomic64_t
417 *
418 * Atomically decrements @ptr by 1.
419 */
420static inline void atomic64_dec(atomic64_t *ptr)
421{
422 atomic64_sub(1, ptr);
423}
424
425/**
426 * atomic64_dec_and_test - decrement and test
427 * @ptr: pointer to type atomic64_t
428 *
429 * Atomically decrements @ptr by 1 and
430 * returns true if the result is 0, or false for all other
431 * cases.
432 */
433static inline int atomic64_dec_and_test(atomic64_t *ptr)
434{
435 return atomic64_sub_and_test(1, ptr);
436}
437
438/**
439 * atomic64_inc_and_test - increment and test
440 * @ptr: pointer to type atomic64_t
441 *
442 * Atomically increments @ptr by 1
443 * and returns true if the result is zero, or false for all
444 * other cases.
445 */
446static inline int atomic64_inc_and_test(atomic64_t *ptr)
447{
448 return atomic64_sub_and_test(-1, ptr);
449}
450
451/**
452 * atomic64_add_negative - add and test if negative
453 * @delta: integer value to add
454 * @ptr: pointer to type atomic64_t
455 *
456 * Atomically adds @delta to @ptr and returns true
457 * if the result is negative, or false when
458 * result is greater than or equal to zero.
459 */
460static inline int
461atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
462{
463 long long old_val = atomic64_add_return(delta, ptr);
464
465 return old_val < 0;
466}
467
250#include <asm-generic/atomic.h> 468#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 469#endif /* _ASM_X86_ATOMIC_32_H */
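
The atomic64 helpers above all follow one pattern: load the old value,
compute the new one, and retry a compare-and-swap until it succeeds. As a
rough user-space analogue (an illustration of the technique under stated
assumptions, not kernel code), the same loop can be written with the
GCC/Clang __atomic builtins:

#include <stdint.h>

/* Same retry-loop structure as atomic64_add_return() above, but using
 * compiler builtins instead of the kernel's cmpxchg8b wrapper: */
static uint64_t atomic64_add_return_emul(uint64_t delta, uint64_t *ptr)
{
	uint64_t old_val, new_val;

	do {
		old_val = __atomic_load_n(ptr, __ATOMIC_RELAXED);
		new_val = old_val + delta;
		/* on failure, loop around and re-read the current value: */
	} while (!__atomic_compare_exchange_n(ptr, &old_val, new_val,
					      0, __ATOMIC_SEQ_CST,
					      __ATOMIC_RELAXED));

	return new_val;
}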
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index cf7954d1405f..7a07897a7888 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
9 unsigned long idle_timestamp; 9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */ 10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */ 11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int apic_perf_irqs; /* arch dependent */
12 unsigned int irq0_irqs; 13 unsigned int irq0_irqs;
13 unsigned int irq_resched_count; 14 unsigned int irq_resched_count;
14 unsigned int irq_call_count; 15 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b959..aa93e53b85ee 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
30/* Interrupt handlers registered during init_IRQ */ 30/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void); 31extern void apic_timer_interrupt(void);
32extern void error_interrupt(void); 32extern void error_interrupt(void);
33extern void perf_counter_interrupt(void);
34
33extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
34extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
35extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..21a0b92027f5 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
87#define LOCAL_TIMER_VECTOR 0xef 87#define LOCAL_TIMER_VECTOR 0xef
88 88
89/* 89/*
90 * Performance monitoring interrupt vector:
91 */
92#define LOCAL_PERF_VECTOR 0xee
93
94/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we 95 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority 96 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector) 97 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..ad31e5d90e90 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
25 * a much simpler SMP time architecture: 25 * a much simpler SMP time architecture:
26 */ 26 */
27#ifdef CONFIG_X86_LOCAL_APIC 27#ifdef CONFIG_X86_LOCAL_APIC
28
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) 29BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 30BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 31BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31 32
33#ifdef CONFIG_PERF_COUNTERS
34BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
35#endif
36
32#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 38BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif 39#endif
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff88df37..90a8d9d4206b 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -30,6 +30,7 @@ struct x8664_pda {
30 short isidle; 30 short isidle;
31 struct mm_struct *active_mm; 31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs; 32 unsigned apic_timer_irqs;
33 unsigned apic_perf_irqs;
33 unsigned irq0_irqs; 34 unsigned irq0_irqs;
34 unsigned irq_resched_count; 35 unsigned irq_resched_count;
35 unsigned irq_call_count; 36 unsigned irq_call_count;
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..2e08ed736647
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,95 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(int nmi);
90#else
91static inline void init_hw_perf_counters(void) { }
92static inline void perf_counters_lapic_init(int nmi) { }
93#endif
94
95#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 98789647baa9..efdf93820aed 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
83#define TIF_SECCOMP 8 /* secure computing */ 83#define TIF_SECCOMP 8 /* secure computing */
84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
85#define TIF_PERF_COUNTERS 11 /* notify perf counter work */
85#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 86#define TIF_NOTSC 16 /* TSC is not accessible in userland */
86#define TIF_IA32 17 /* 32bit process */ 87#define TIF_IA32 17 /* 32bit process */
87#define TIF_FORK 18 /* ret_from_fork */ 88#define TIF_FORK 18 /* ret_from_fork */
@@ -104,6 +105,7 @@ struct thread_info {
104#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 105#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
105#define _TIF_SECCOMP (1 << TIF_SECCOMP) 106#define _TIF_SECCOMP (1 << TIF_SECCOMP)
106#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 107#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
108#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS)
107#define _TIF_NOTSC (1 << TIF_NOTSC) 109#define _TIF_NOTSC (1 << TIF_NOTSC)
108#define _TIF_IA32 (1 << TIF_IA32) 110#define _TIF_IA32 (1 << TIF_IA32)
109#define _TIF_FORK (1 << TIF_FORK) 111#define _TIF_FORK (1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
135 137
136/* Only used for 64 bit */ 138/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \ 139#define _TIF_DO_NOTIFY_MASK \
138 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 140 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
139 141
140/* flags to check in __switch_to() */ 142/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \ 143#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..7e47658b0a6f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
338#define __NR_dup3 330 338#define __NR_dup3 330
339#define __NR_pipe2 331 339#define __NR_pipe2 331
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_perf_counter_open 333
341 342
342#ifdef __KERNEL__ 343#ifdef __KERNEL__
343 344
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666f..53025feaf88d 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
653__SYSCALL(__NR_pipe2, sys_pipe2) 653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294 654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1) 655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656 656#define __NR_perf_counter_open 295
657__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
657 658
658#ifndef __NO_STUBS 659#ifndef __NO_STUBS
659#define __ARCH_WANT_OLD_READDIR 660#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 566a08466b19..d2d17b8d10f8 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -35,6 +35,7 @@
35#include <linux/nmi.h> 35#include <linux/nmi.h>
36#include <linux/timex.h> 36#include <linux/timex.h>
37 37
38#include <asm/perf_counter.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <asm/mtrr.h> 40#include <asm/mtrr.h>
40#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -1134,6 +1135,7 @@ void __cpuinit setup_local_APIC(void)
1134 apic_write(APIC_ESR, 0); 1135 apic_write(APIC_ESR, 0);
1135 } 1136 }
1136#endif 1137#endif
1138 perf_counters_lapic_init(0);
1137 1139
1138 preempt_disable(); 1140 preempt_disable();
1139 1141
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..c3813306e0b4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -22,11 +22,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 24
25obj-$(CONFIG_X86_MCE) += mcheck/ 25obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
26obj-$(CONFIG_MTRR) += mtrr/
27obj-$(CONFIG_CPU_FREQ) += cpufreq/
28 26
29obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 27obj-$(CONFIG_X86_MCE) += mcheck/
28obj-$(CONFIG_MTRR) += mtrr/
29obj-$(CONFIG_CPU_FREQ) += cpufreq/
30
31obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
30 32
31quiet_cmd_mkcapflags = MKCAP $@ 33quiet_cmd_mkcapflags = MKCAP $@
32 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 34 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1f93b1..667e5d561ed7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/mtrr.h> 18#include <asm/mtrr.h>
19#include <asm/mce.h> 19#include <asm/mce.h>
20#include <asm/perf_counter.h>
20#include <asm/pat.h> 21#include <asm/pat.h>
21#include <asm/asm.h> 22#include <asm/asm.h>
22#include <asm/numa.h> 23#include <asm/numa.h>
@@ -772,6 +773,7 @@ void __init identify_boot_cpu(void)
772#else 773#else
773 vgetcpu_set_mode(); 774 vgetcpu_set_mode();
774#endif 775#endif
776 init_hw_perf_counters();
775} 777}
776 778
777void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 779void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..9376771f757b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,695 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/perf_counter.h>
11#include <linux/capability.h>
12#include <linux/notifier.h>
13#include <linux/hardirq.h>
14#include <linux/kprobes.h>
15#include <linux/module.h>
16#include <linux/kdebug.h>
17#include <linux/sched.h>
18
19#include <asm/perf_counter.h>
20#include <asm/apic.h>
21
22static bool perf_counters_initialized __read_mostly;
23
24/*
25 * Number of (generic) HW counters:
26 */
27static int nr_counters_generic __read_mostly;
28static u64 perf_counter_mask __read_mostly;
29static u64 counter_value_mask __read_mostly;
30
31static int nr_counters_fixed __read_mostly;
32
33struct cpu_hw_counters {
34 struct perf_counter *counters[X86_PMC_IDX_MAX];
35 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
36};
37
38/*
39 * Intel PerfMon v3. Used on Core2 and later.
40 */
41static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
42
43static const int intel_perfmon_event_map[] =
44{
45 [PERF_COUNT_CPU_CYCLES] = 0x003c,
46 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
47 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
48 [PERF_COUNT_CACHE_MISSES] = 0x412e,
49 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
50 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
51 [PERF_COUNT_BUS_CYCLES] = 0x013c,
52};
53
54static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
55
56/*
57 * Propagate counter elapsed time into the generic counter.
58 * Can only be executed on the CPU where the counter is active.
59 * Returns the delta events processed.
60 */
61static void
62x86_perf_counter_update(struct perf_counter *counter,
63 struct hw_perf_counter *hwc, int idx)
64{
65 u64 prev_raw_count, new_raw_count, delta;
66
67 /*
68 * Careful: an NMI might modify the previous counter value.
69 *
70 * Our tactic to handle this is to first atomically read and
71 * exchange a new raw count - then add that new-prev delta
72 * count to the generic counter atomically:
73 */
74again:
75 prev_raw_count = atomic64_read(&hwc->prev_count);
76 rdmsrl(hwc->counter_base + idx, new_raw_count);
77
78 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
79 new_raw_count) != prev_raw_count)
80 goto again;
81
82 /*
83 * Now we have the new raw value and have updated the prev
84 * timestamp already. We can now calculate the elapsed delta
85 * (counter-)time and add that to the generic counter.
86 *
87 * Careful, not all hw sign-extends above the physical width
88 * of the count, so we do that by clipping the delta to 32 bits:
89 */
90 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
91
92 atomic64_add(delta, &counter->count);
93 atomic64_sub(delta, &hwc->period_left);
94}
95
96/*
97 * Setup the hardware configuration for a given hw_event_type
98 */
99static int __hw_perf_counter_init(struct perf_counter *counter)
100{
101 struct perf_counter_hw_event *hw_event = &counter->hw_event;
102 struct hw_perf_counter *hwc = &counter->hw;
103
104 if (unlikely(!perf_counters_initialized))
105 return -EINVAL;
106
107 /*
108 * Count user events, and generate PMC IRQs:
109 * (keep 'enabled' bit clear for now)
110 */
111 hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
112
113 /*
114 * If privileged enough, count OS events too, and allow
115 * NMI events as well:
116 */
117 hwc->nmi = 0;
118 if (capable(CAP_SYS_ADMIN)) {
119 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
120 if (hw_event->nmi)
121 hwc->nmi = 1;
122 }
123
124 hwc->irq_period = hw_event->irq_period;
125 /*
126 * Intel PMCs cannot be accessed sanely above 32 bit width,
127 * so we install an artificial 1<<31 period regardless of
128 * the generic counter period:
129 */
130 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
131 hwc->irq_period = 0x7FFFFFFF;
132
133 atomic64_set(&hwc->period_left, hwc->irq_period);
134
135 /*
 136 * Raw event types provide the config in the event structure:
137 */
138 if (hw_event->raw) {
139 hwc->config |= hw_event->type;
140 } else {
141 if (hw_event->type >= max_intel_perfmon_events)
142 return -EINVAL;
143 /*
144 * The generic map:
145 */
146 hwc->config |= intel_perfmon_event_map[hw_event->type];
147 }
148 counter->wakeup_pending = 0;
149
150 return 0;
151}
152
153u64 hw_perf_save_disable(void)
154{
155 u64 ctrl;
156
157 if (unlikely(!perf_counters_initialized))
158 return 0;
159
160 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
161 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
162
163 return ctrl;
164}
165EXPORT_SYMBOL_GPL(hw_perf_save_disable);
166
167void hw_perf_restore(u64 ctrl)
168{
169 if (unlikely(!perf_counters_initialized))
170 return;
171
172 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
173}
174EXPORT_SYMBOL_GPL(hw_perf_restore);
175
176static inline void
177__pmc_fixed_disable(struct perf_counter *counter,
178 struct hw_perf_counter *hwc, unsigned int __idx)
179{
180 int idx = __idx - X86_PMC_IDX_FIXED;
181 u64 ctrl_val, mask;
182 int err;
183
184 mask = 0xfULL << (idx * 4);
185
186 rdmsrl(hwc->config_base, ctrl_val);
187 ctrl_val &= ~mask;
188 err = checking_wrmsrl(hwc->config_base, ctrl_val);
189}
190
191static inline void
192__pmc_generic_disable(struct perf_counter *counter,
193 struct hw_perf_counter *hwc, unsigned int idx)
194{
195 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
196 __pmc_fixed_disable(counter, hwc, idx);
197 else
198 wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
199}
200
201static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
202
203/*
204 * Set the next IRQ period, based on the hwc->period_left value.
205 * To be called with the counter disabled in hw:
206 */
207static void
208__hw_perf_counter_set_period(struct perf_counter *counter,
209 struct hw_perf_counter *hwc, int idx)
210{
211 s64 left = atomic64_read(&hwc->period_left);
212 s32 period = hwc->irq_period;
213 int err;
214
215 /*
 216 * If we are way outside a reasonable range then just skip forward:
217 */
218 if (unlikely(left <= -period)) {
219 left = period;
220 atomic64_set(&hwc->period_left, left);
221 }
222
223 if (unlikely(left <= 0)) {
224 left += period;
225 atomic64_set(&hwc->period_left, left);
226 }
227
228 per_cpu(prev_left[idx], smp_processor_id()) = left;
229
230 /*
231 * The hw counter starts counting from this counter offset,
 232 * mark it to be able to extract future deltas:
233 */
234 atomic64_set(&hwc->prev_count, (u64)-left);
235
236 err = checking_wrmsrl(hwc->counter_base + idx,
237 (u64)(-left) & counter_value_mask);
238}
239
240static inline void
241__pmc_fixed_enable(struct perf_counter *counter,
242 struct hw_perf_counter *hwc, unsigned int __idx)
243{
244 int idx = __idx - X86_PMC_IDX_FIXED;
245 u64 ctrl_val, bits, mask;
246 int err;
247
248 /*
249 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
250 * and enable ring-0 counting if allowed:
251 */
252 bits = 0x8ULL | 0x2ULL;
253 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
254 bits |= 0x1;
255 bits <<= (idx * 4);
256 mask = 0xfULL << (idx * 4);
257
258 rdmsrl(hwc->config_base, ctrl_val);
259 ctrl_val &= ~mask;
260 ctrl_val |= bits;
261 err = checking_wrmsrl(hwc->config_base, ctrl_val);
262}
263
264static void
265__pmc_generic_enable(struct perf_counter *counter,
266 struct hw_perf_counter *hwc, int idx)
267{
268 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
269 __pmc_fixed_enable(counter, hwc, idx);
270 else
271 wrmsr(hwc->config_base + idx,
272 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
273}
274
275static int
276fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
277{
278 unsigned int event;
279
280 if (unlikely(hwc->nmi))
281 return -1;
282
283 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
284
285 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
286 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
287 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
288 return X86_PMC_IDX_FIXED_CPU_CYCLES;
289 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
290 return X86_PMC_IDX_FIXED_BUS_CYCLES;
291
292 return -1;
293}
294
295/*
296 * Find a PMC slot for the freshly enabled / scheduled in counter:
297 */
298static int pmc_generic_enable(struct perf_counter *counter)
299{
300 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
301 struct hw_perf_counter *hwc = &counter->hw;
302 int idx;
303
304 idx = fixed_mode_idx(counter, hwc);
305 if (idx >= 0) {
306 /*
307 * Try to get the fixed counter, if that is already taken
308 * then try to get a generic counter:
309 */
310 if (test_and_set_bit(idx, cpuc->used))
311 goto try_generic;
312
313 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
314 /*
315 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
316 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
317 */
318 hwc->counter_base =
319 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
320 hwc->idx = idx;
321 } else {
322 idx = hwc->idx;
323 /* Try to get the previous generic counter again */
324 if (test_and_set_bit(idx, cpuc->used)) {
325try_generic:
326 idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
327 if (idx == nr_counters_generic)
328 return -EAGAIN;
329
330 set_bit(idx, cpuc->used);
331 hwc->idx = idx;
332 }
333 hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
334 hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
335 }
336
337 perf_counters_lapic_init(hwc->nmi);
338
339 __pmc_generic_disable(counter, hwc, idx);
340
341 cpuc->counters[idx] = counter;
342 /*
343 * Make it visible before enabling the hw:
344 */
345 smp_wmb();
346
347 __hw_perf_counter_set_period(counter, hwc, idx);
348 __pmc_generic_enable(counter, hwc, idx);
349
350 return 0;
351}
352
353void perf_counter_print_debug(void)
354{
355 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
356 struct cpu_hw_counters *cpuc;
357 int cpu, idx;
358
359 if (!nr_counters_generic)
360 return;
361
362 local_irq_disable();
363
364 cpu = smp_processor_id();
365 cpuc = &per_cpu(cpu_hw_counters, cpu);
366
367 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
368 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
369 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
370 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
371
372 printk(KERN_INFO "\n");
373 printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
374 printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
375 printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
376 printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed);
377 printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
378
379 for (idx = 0; idx < nr_counters_generic; idx++) {
380 rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
381 rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
382
383 prev_left = per_cpu(prev_left[idx], cpu);
384
385 printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n",
386 cpu, idx, pmc_ctrl);
387 printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n",
388 cpu, idx, pmc_count);
389 printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n",
390 cpu, idx, prev_left);
391 }
392 for (idx = 0; idx < nr_counters_fixed; idx++) {
393 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
394
395 printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
396 cpu, idx, pmc_count);
397 }
398 local_irq_enable();
399}
400
401static void pmc_generic_disable(struct perf_counter *counter)
402{
403 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
404 struct hw_perf_counter *hwc = &counter->hw;
405 unsigned int idx = hwc->idx;
406
407 __pmc_generic_disable(counter, hwc, idx);
408
409 clear_bit(idx, cpuc->used);
410 cpuc->counters[idx] = NULL;
411 /*
412 * Make sure the cleared pointer becomes visible before we
413 * (potentially) free the counter:
414 */
415 smp_wmb();
416
417 /*
418 * Drain the remaining delta count out of a counter
419 * that we are disabling:
420 */
421 x86_perf_counter_update(counter, hwc, idx);
422}
423
424static void perf_store_irq_data(struct perf_counter *counter, u64 data)
425{
426 struct perf_data *irqdata = counter->irqdata;
427
428 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
429 irqdata->overrun++;
430 } else {
431 u64 *p = (u64 *) &irqdata->data[irqdata->len];
432
433 *p = data;
434 irqdata->len += sizeof(u64);
435 }
436}
437
438/*
439 * Save and restart an expired counter. Called by NMI contexts,
440 * so it has to be careful about preempting normal counter ops:
441 */
442static void perf_save_and_restart(struct perf_counter *counter)
443{
444 struct hw_perf_counter *hwc = &counter->hw;
445 int idx = hwc->idx;
446
447 x86_perf_counter_update(counter, hwc, idx);
448 __hw_perf_counter_set_period(counter, hwc, idx);
449
450 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
451 __pmc_generic_enable(counter, hwc, idx);
452}
453
454static void
455perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
456{
457 struct perf_counter *counter, *group_leader = sibling->group_leader;
458
459 /*
460 * Store sibling timestamps (if any):
461 */
462 list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
463
464 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
465 perf_store_irq_data(sibling, counter->hw_event.type);
466 perf_store_irq_data(sibling, atomic64_read(&counter->count));
467 }
468}
469
470/*
471 * This handler is triggered by the local APIC, so the APIC IRQ handling
472 * rules apply:
473 */
474static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
475{
476 int bit, cpu = smp_processor_id();
477 u64 ack, status, saved_global;
478 struct cpu_hw_counters *cpuc;
479
480 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
481
482 /* Disable counters globally */
483 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
484 ack_APIC_irq();
485
486 cpuc = &per_cpu(cpu_hw_counters, cpu);
487
488 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
489 if (!status)
490 goto out;
491
492again:
493 ack = status;
494 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
495 struct perf_counter *counter = cpuc->counters[bit];
496
497 clear_bit(bit, (unsigned long *) &status);
498 if (!counter)
499 continue;
500
501 perf_save_and_restart(counter);
502
503 switch (counter->hw_event.record_type) {
504 case PERF_RECORD_SIMPLE:
505 continue;
506 case PERF_RECORD_IRQ:
507 perf_store_irq_data(counter, instruction_pointer(regs));
508 break;
509 case PERF_RECORD_GROUP:
510 perf_handle_group(counter, &status, &ack);
511 break;
512 }
513 /*
514 * From NMI context we cannot call into the scheduler to
 515 * do a task wakeup - but we mark these counters as
 516 * wakeup_pending and initiate a wakeup callback:
517 */
518 if (nmi) {
519 counter->wakeup_pending = 1;
520 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
521 } else {
522 wake_up(&counter->waitq);
523 }
524 }
525
526 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
527
528 /*
529 * Repeat if there is more work to be done:
530 */
531 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
532 if (status)
533 goto again;
534out:
535 /*
536 * Restore - do not reenable when global enable is off:
537 */
538 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
539}
540
541void smp_perf_counter_interrupt(struct pt_regs *regs)
542{
543 irq_enter();
544 inc_irq_stat(apic_perf_irqs);
545 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
546 __smp_perf_counter_interrupt(regs, 0);
547
548 irq_exit();
549}
550
551/*
552 * This handler is triggered by NMI contexts:
553 */
554void perf_counter_notify(struct pt_regs *regs)
555{
556 struct cpu_hw_counters *cpuc;
557 unsigned long flags;
558 int bit, cpu;
559
560 local_irq_save(flags);
561 cpu = smp_processor_id();
562 cpuc = &per_cpu(cpu_hw_counters, cpu);
563
564 for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
565 struct perf_counter *counter = cpuc->counters[bit];
566
567 if (!counter)
568 continue;
569
570 if (counter->wakeup_pending) {
571 counter->wakeup_pending = 0;
572 wake_up(&counter->waitq);
573 }
574 }
575
576 local_irq_restore(flags);
577}
578
579void __cpuinit perf_counters_lapic_init(int nmi)
580{
581 u32 apic_val;
582
583 if (!perf_counters_initialized)
584 return;
585 /*
586 * Enable the performance counter vector in the APIC LVT:
587 */
588 apic_val = apic_read(APIC_LVTERR);
589
590 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
591 if (nmi)
592 apic_write(APIC_LVTPC, APIC_DM_NMI);
593 else
594 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
595 apic_write(APIC_LVTERR, apic_val);
596}
597
598static int __kprobes
599perf_counter_nmi_handler(struct notifier_block *self,
600 unsigned long cmd, void *__args)
601{
602 struct die_args *args = __args;
603 struct pt_regs *regs;
604
605 if (likely(cmd != DIE_NMI_IPI))
606 return NOTIFY_DONE;
607
608 regs = args->regs;
609
610 apic_write(APIC_LVTPC, APIC_DM_NMI);
611 __smp_perf_counter_interrupt(regs, 1);
612
613 return NOTIFY_STOP;
614}
615
616static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
617 .notifier_call = perf_counter_nmi_handler
618};
619
620void __init init_hw_perf_counters(void)
621{
622 union cpuid10_eax eax;
623 unsigned int ebx;
624 unsigned int unused;
625 union cpuid10_edx edx;
626
627 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
628 return;
629
630 /*
631 * Check whether the Architectural PerfMon supports
632 * Branch Misses Retired Event or not.
633 */
634 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
635 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
636 return;
637
638 printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
639
640 printk(KERN_INFO "... version: %d\n", eax.split.version_id);
641 printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters);
642 nr_counters_generic = eax.split.num_counters;
643 if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
644 nr_counters_generic = X86_PMC_MAX_GENERIC;
645 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
646 nr_counters_generic, X86_PMC_MAX_GENERIC);
647 }
648 perf_counter_mask = (1 << nr_counters_generic) - 1;
649 perf_max_counters = nr_counters_generic;
650
651 printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width);
652 counter_value_mask = (1ULL << eax.split.bit_width) - 1;
653 printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask);
654
655 printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length);
656
657 nr_counters_fixed = edx.split.num_counters_fixed;
658 if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
659 nr_counters_fixed = X86_PMC_MAX_FIXED;
660 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
661 nr_counters_fixed, X86_PMC_MAX_FIXED);
662 }
663 printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed);
664
665 perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
666
667 printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask);
668 perf_counters_initialized = true;
669
670 perf_counters_lapic_init(0);
671 register_die_notifier(&perf_counter_nmi_notifier);
672}
673
674static void pmc_generic_read(struct perf_counter *counter)
675{
676 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
677}
678
679static const struct hw_perf_counter_ops x86_perf_counter_ops = {
680 .enable = pmc_generic_enable,
681 .disable = pmc_generic_disable,
682 .read = pmc_generic_read,
683};
684
685const struct hw_perf_counter_ops *
686hw_perf_counter_init(struct perf_counter *counter)
687{
688 int err;
689
690 err = __hw_perf_counter_init(counter);
691 if (err)
692 return NULL;
693
694 return &x86_perf_counter_ops;
695}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e28c7a987793..1954a9662203 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1024,6 +1024,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1024apicinterrupt SPURIOUS_APIC_VECTOR \ 1024apicinterrupt SPURIOUS_APIC_VECTOR \
1025 spurious_interrupt smp_spurious_interrupt 1025 spurious_interrupt smp_spurious_interrupt
1026 1026
1027#ifdef CONFIG_PERF_COUNTERS
1028apicinterrupt LOCAL_PERF_VECTOR \
1029 perf_counter_interrupt smp_perf_counter_interrupt
1030#endif
1031
1027/* 1032/*
1028 * Exception entry points. 1033 * Exception entry points.
1029 */ 1034 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..22f650db917f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -57,6 +57,10 @@ static int show_other_interrupts(struct seq_file *p)
57 for_each_online_cpu(j) 57 for_each_online_cpu(j)
58 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 58 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
59 seq_printf(p, " Local timer interrupts\n"); 59 seq_printf(p, " Local timer interrupts\n");
60 seq_printf(p, "CNT: ");
61 for_each_online_cpu(j)
62 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
63 seq_printf(p, " Performance counter interrupts\n");
60#endif 64#endif
61#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
62 seq_printf(p, "RES: "); 66 seq_printf(p, "RES: ");
@@ -164,6 +168,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
164 168
165#ifdef CONFIG_X86_LOCAL_APIC 169#ifdef CONFIG_X86_LOCAL_APIC
166 sum += irq_stats(cpu)->apic_timer_irqs; 170 sum += irq_stats(cpu)->apic_timer_irqs;
171 sum += irq_stats(cpu)->apic_perf_irqs;
167#endif 172#endif
168#ifdef CONFIG_SMP 173#ifdef CONFIG_SMP
169 sum += irq_stats(cpu)->irq_resched_count; 174 sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1507ad4e674d..0bef6280f30c 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -171,6 +171,9 @@ void __init native_init_IRQ(void)
171 /* IPI vectors for APIC spurious and error interrupts */ 171 /* IPI vectors for APIC spurious and error interrupts */
172 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 172 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
173 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 173 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
174# ifdef CONFIG_PERF_COUNTERS
175 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
176# endif
174#endif 177#endif
175 178
176#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 179#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f30..6a71bfc51e51 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -150,6 +150,11 @@ static void __init apic_intr_init(void)
150 /* IPI vectors for APIC spurious and error interrupts */ 150 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
153
154 /* Performance monitoring interrupt: */
155#ifdef CONFIG_PERF_COUNTERS
156 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
157#endif
153} 158}
154 159
155void __init native_init_IRQ(void) 160void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 89bb7668041d..4fa5243c2069 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9 9#include <linux/perf_counter.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
@@ -886,6 +886,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
886 tracehook_notify_resume(regs); 886 tracehook_notify_resume(regs);
887 } 887 }
888 888
889 if (thread_info_flags & _TIF_PERF_COUNTERS) {
890 clear_thread_flag(TIF_PERF_COUNTERS);
891 perf_counter_notify(regs);
892 }
893
889#ifdef CONFIG_X86_32 894#ifdef CONFIG_X86_32
890 clear_thread_flag(TIF_IRET); 895 clear_thread_flag(TIF_IRET);
891#endif /* CONFIG_X86_32 */ 896#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..496726ddcea1 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c744cf3..07c914555a5e 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 66a9d8145562..7acb23f830ce 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -271,8 +271,11 @@ static atomic_t c3_cpu_count;
271/* Common C-state entry for C2, C3, .. */ 271/* Common C-state entry for C2, C3, .. */
272static void acpi_cstate_enter(struct acpi_processor_cx *cstate) 272static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
273{ 273{
274 u64 perf_flags;
275
274 /* Don't trace irqs off for idle */ 276 /* Don't trace irqs off for idle */
275 stop_critical_timings(); 277 stop_critical_timings();
278 perf_flags = hw_perf_save_disable();
276 if (cstate->entry_method == ACPI_CSTATE_FFH) { 279 if (cstate->entry_method == ACPI_CSTATE_FFH) {
277 /* Call into architectural FFH based C-state */ 280 /* Call into architectural FFH based C-state */
278 acpi_processor_ffh_cstate_enter(cstate); 281 acpi_processor_ffh_cstate_enter(cstate);
@@ -285,6 +288,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
285 gets asserted in time to freeze execution properly. */ 288 gets asserted in time to freeze execution properly. */
286 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 289 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
287 } 290 }
291 hw_perf_restore(perf_flags);
288 start_critical_timings(); 292 start_critical_timings();
289} 293}
290#endif /* !CONFIG_CPU_IDLE */ 294#endif /* !CONFIG_CPU_IDLE */
@@ -1426,8 +1430,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
1426 */ 1430 */
1427static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 1431static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1428{ 1432{
1433 u64 pctrl;
1434
1429 /* Don't trace irqs off for idle */ 1435 /* Don't trace irqs off for idle */
1430 stop_critical_timings(); 1436 stop_critical_timings();
1437 pctrl = hw_perf_save_disable();
1431 if (cx->entry_method == ACPI_CSTATE_FFH) { 1438 if (cx->entry_method == ACPI_CSTATE_FFH) {
1432 /* Call into architectural FFH based C-state */ 1439 /* Call into architectural FFH based C-state */
1433 acpi_processor_ffh_cstate_enter(cx); 1440 acpi_processor_ffh_cstate_enter(cx);
@@ -1442,6 +1449,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1442 gets asserted in time to freeze execution properly. */ 1449 gets asserted in time to freeze execution properly. */
1443 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 1450 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
1444 } 1451 }
1452 hw_perf_restore(pctrl);
1445 start_critical_timings(); 1453 start_critical_timings();
1446} 1454}
1447 1455
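
A minimal kernel-side sketch of the pattern the two idle hunks above apply:
globally stop the counters across a region that must stay quiet, then restore
the previous state. Illustrative only; quiet_region() is a hypothetical
placeholder, while hw_perf_save_disable()/hw_perf_restore() are the interfaces
this series adds in <linux/perf_counter.h>.

    #include <linux/types.h>
    #include <linux/perf_counter.h>

    /* Hypothetical placeholder for work that must not be perturbed: */
    static void quiet_region(void)
    {
    }

    static void do_quiet_work(void)
    {
            u64 perf_flags;

            perf_flags = hw_perf_save_disable();    /* stop all counters, remember state */
            quiet_region();
            hw_perf_restore(perf_flags);            /* re-enable what was running before */
    }
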
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index d41b9f6f7903..5a3eab0882a0 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
244 struct pt_regs *regs = get_irq_regs(); 245 struct pt_regs *regs = get_irq_regs();
245 if (regs) 246 if (regs)
246 show_regs(regs); 247 show_regs(regs);
248 perf_counter_print_debug();
247} 249}
248static struct sysrq_key_op sysrq_showregs_op = { 250static struct sysrq_key_op sysrq_showregs_op = {
249 .handler = sysrq_handle_showregs, 251 .handler = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index 71a6efe5d8bd..605be573fe87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -1010,6 +1011,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1010 1011
1011 current->personality &= ~bprm->per_clear; 1012 current->personality &= ~bprm->per_clear;
1012 1013
1014 /*
1015 * Flush performance counters when crossing a
1016 * security domain:
1017 */
1018 if (!get_dumpable(current->mm))
1019 perf_counter_exit_task(current);
1020
1013 /* An exec changes our domain. We are no longer part of the thread 1021 /* An exec changes our domain. We are no longer part of the thread
1014 group */ 1022 group */
1015 1023
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2f3c2d4ef73b..49a40fbc806b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -115,6 +115,16 @@ extern struct group_info init_groups;
115 115
116extern struct cred init_cred; 116extern struct cred init_cred;
117 117
118#ifdef CONFIG_PERF_COUNTERS
119# define INIT_PERF_COUNTERS(tsk) \
120 .perf_counter_ctx.counter_list = \
121 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
122 .perf_counter_ctx.lock = \
123 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
124#else
125# define INIT_PERF_COUNTERS(tsk)
126#endif
127
118/* 128/*
119 * INIT_TASK is used to set up the first task table, touch at 129 * INIT_TASK is used to set up the first task table, touch at
120 * your own risk!. Base=0, limit=0x1fffff (=2MB) 130 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -179,6 +189,7 @@ extern struct cred init_cred;
179 INIT_IDS \ 189 INIT_IDS \
180 INIT_TRACE_IRQFLAGS \ 190 INIT_TRACE_IRQFLAGS \
181 INIT_LOCKDEP \ 191 INIT_LOCKDEP \
192 INIT_PERF_COUNTERS(tsk) \
182} 193}
183 194
184 195
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d20413119..ecfa66817634 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
78 return sum; 78 return sum;
79} 79}
80 80
81
82/*
83 * Lock/unlock the current runqueue - to extract task statistics:
84 */
85extern void curr_rq_lock_irq_save(unsigned long *flags);
86extern void curr_rq_unlock_irq_restore(unsigned long *flags);
87extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
81extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
82extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
83extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
84extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..cc3a75a239a9
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,257 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <asm/atomic.h>
17
18#ifdef CONFIG_PERF_COUNTERS
19# include <asm/perf_counter.h>
20#endif
21
22#include <linux/list.h>
23#include <linux/mutex.h>
24#include <linux/rculist.h>
25#include <linux/rcupdate.h>
26#include <linux/spinlock.h>
27
28struct task_struct;
29
30/*
31 * User-space ABI bits:
32 */
33
34/*
35 * Generalized performance counter event types, used by the hw_event.type
36 * parameter of the sys_perf_counter_open() syscall:
37 */
38enum hw_event_types {
39 /*
40 * Common hardware events, generalized by the kernel:
41 */
42 PERF_COUNT_CPU_CYCLES = 0,
43 PERF_COUNT_INSTRUCTIONS = 1,
44 PERF_COUNT_CACHE_REFERENCES = 2,
45 PERF_COUNT_CACHE_MISSES = 3,
46 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
47 PERF_COUNT_BRANCH_MISSES = 5,
48 PERF_COUNT_BUS_CYCLES = 6,
49
50 PERF_HW_EVENTS_MAX = 7,
51
52 /*
53 * Special "software" counters provided by the kernel, even if
54 * the hardware does not support performance counters. These
55 * counters measure various physical and sw events of the
56 * kernel (and allow the profiling of them as well):
57 */
58 PERF_COUNT_CPU_CLOCK = -1,
59 PERF_COUNT_TASK_CLOCK = -2,
60 PERF_COUNT_PAGE_FAULTS = -3,
61 PERF_COUNT_CONTEXT_SWITCHES = -4,
62 PERF_COUNT_CPU_MIGRATIONS = -5,
63
64 PERF_SW_EVENTS_MIN = -6,
65};
66
67/*
68 * IRQ-notification data record type:
69 */
70enum perf_counter_record_type {
71 PERF_RECORD_SIMPLE = 0,
72 PERF_RECORD_IRQ = 1,
73 PERF_RECORD_GROUP = 2,
74};
75
76/*
77 * Hardware event to monitor via a performance monitoring counter:
78 */
79struct perf_counter_hw_event {
80 s64 type;
81
82 u64 irq_period;
83 u32 record_type;
84
85 u32 disabled : 1, /* off by default */
86 nmi : 1, /* NMI sampling */
87 raw : 1, /* raw event type */
88 inherit : 1, /* children inherit it */
89 __reserved_1 : 28;
90
91 u64 __reserved_2;
92};
93
94/*
95 * Kernel-internal data types:
96 */
97
98/**
99 * struct hw_perf_counter - performance counter hardware details:
100 */
101struct hw_perf_counter {
102#ifdef CONFIG_PERF_COUNTERS
103 u64 config;
104 unsigned long config_base;
105 unsigned long counter_base;
106 int nmi;
107 unsigned int idx;
108 atomic64_t prev_count;
109 u64 irq_period;
110 atomic64_t period_left;
111#endif
112};
113
114/*
115 * Hardcoded buffer length limit for now, for IRQ-fed events:
116 */
117#define PERF_DATA_BUFLEN 2048
118
119/**
120 * struct perf_data - performance counter IRQ data sampling ...
121 */
122struct perf_data {
123 int len;
124 int rd_idx;
125 int overrun;
126 u8 data[PERF_DATA_BUFLEN];
127};
128
129struct perf_counter;
130
131/**
132 * struct hw_perf_counter_ops - performance counter hw ops
133 */
134struct hw_perf_counter_ops {
135 int (*enable) (struct perf_counter *counter);
136 void (*disable) (struct perf_counter *counter);
137 void (*read) (struct perf_counter *counter);
138};
139
140/**
141 * enum perf_counter_active_state - the states of a counter
142 */
143enum perf_counter_active_state {
144 PERF_COUNTER_STATE_OFF = -1,
145 PERF_COUNTER_STATE_INACTIVE = 0,
146 PERF_COUNTER_STATE_ACTIVE = 1,
147};
148
149struct file;
150
151/**
152 * struct perf_counter - performance counter kernel representation:
153 */
154struct perf_counter {
155#ifdef CONFIG_PERF_COUNTERS
156 struct list_head list_entry;
157 struct list_head sibling_list;
158 struct perf_counter *group_leader;
159 const struct hw_perf_counter_ops *hw_ops;
160
161 enum perf_counter_active_state state;
162 atomic64_t count;
163
164 struct perf_counter_hw_event hw_event;
165 struct hw_perf_counter hw;
166
167 struct perf_counter_context *ctx;
168 struct task_struct *task;
169 struct file *filp;
170
171 struct perf_counter *parent;
172 /*
173 * Protect attach/detach:
174 */
175 struct mutex mutex;
176
177 int oncpu;
178 int cpu;
179
180 /* read() / irq related data */
181 wait_queue_head_t waitq;
182 /* optional: for NMIs */
183 int wakeup_pending;
184 struct perf_data *irqdata;
185 struct perf_data *usrdata;
186 struct perf_data data[2];
187#endif
188};
189
190/**
191 * struct perf_counter_context - counter context structure
192 *
193 * Used as a container for task counters and CPU counters as well:
194 */
195struct perf_counter_context {
196#ifdef CONFIG_PERF_COUNTERS
197 /*
198 * Protect the list of counters:
199 */
200 spinlock_t lock;
201
202 struct list_head counter_list;
203 int nr_counters;
204 int nr_active;
205 struct task_struct *task;
206#endif
207};
208
209/**
210 * struct perf_cpu_context - per cpu counter context structure
211 */
212struct perf_cpu_context {
213 struct perf_counter_context ctx;
214 struct perf_counter_context *task_ctx;
215 int active_oncpu;
216 int max_pertask;
217};
218
219/*
220 * Set by architecture code:
221 */
222extern int perf_max_counters;
223
224#ifdef CONFIG_PERF_COUNTERS
225extern const struct hw_perf_counter_ops *
226hw_perf_counter_init(struct perf_counter *counter);
227
228extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
229extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
230extern void perf_counter_task_tick(struct task_struct *task, int cpu);
231extern void perf_counter_init_task(struct task_struct *child);
232extern void perf_counter_exit_task(struct task_struct *child);
233extern void perf_counter_notify(struct pt_regs *regs);
234extern void perf_counter_print_debug(void);
235extern u64 hw_perf_save_disable(void);
236extern void hw_perf_restore(u64 ctrl);
237extern int perf_counter_task_disable(void);
238extern int perf_counter_task_enable(void);
239
240#else
241static inline void
242perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
243static inline void
244perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
245static inline void
246perf_counter_task_tick(struct task_struct *task, int cpu) { }
247static inline void perf_counter_init_task(struct task_struct *child) { }
248static inline void perf_counter_exit_task(struct task_struct *child) { }
249static inline void perf_counter_notify(struct pt_regs *regs) { }
250static inline void perf_counter_print_debug(void) { }
251static inline void hw_perf_restore(u64 ctrl) { }
252static inline u64 hw_perf_save_disable(void) { return 0; }
253static inline int perf_counter_task_disable(void) { return -EINVAL; }
254static inline int perf_counter_task_enable(void) { return -EINVAL; }
255#endif
256
257#endif /* _LINUX_PERF_COUNTER_H */
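
To make the ABI above concrete, here is a minimal user-space sketch (not part
of the patch itself): it mirrors the user-visible fields of struct
perf_counter_hw_event, opens one PERF_COUNT_INSTRUCTIONS counter on the
current task and reads it back as a u64. The __NR_perf_counter_open number is
assumed to come from the unistd headers touched earlier in this series.

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>    /* __NR_perf_counter_open: assumed from the patched headers */

    /* User-visible layout of the hw_event structure defined above: */
    struct perf_counter_hw_event {
            int64_t         type;           /* PERF_COUNT_* or raw event */
            uint64_t        irq_period;
            uint32_t        record_type;
            uint32_t        disabled     :  1,
                            nmi          :  1,
                            raw          :  1,
                            inherit      :  1,
                            __reserved_1 : 28;
            uint64_t        __reserved_2;
    };

    int main(void)
    {
            struct perf_counter_hw_event hw_event = {
                    .type           = 1,    /* PERF_COUNT_INSTRUCTIONS */
                    .record_type    = 0,    /* PERF_RECORD_SIMPLE: plain read() */
            };
            uint64_t count;
            int fd;

            /* pid 0: current task, cpu -1: any CPU, group_fd -1: no group */
            fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
            if (fd < 0)
                    return 1;

            /* ... the workload to be measured runs here ... */

            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("instructions: %llu\n", (unsigned long long)count);
            close(fd);
            return 0;
    }
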
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
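
A short usage sketch for the two new prctl() commands, which switch all
counters of the calling task off and back on (a minimal sketch, assuming the
values added above; the phase functions are hypothetical placeholders):

    #include <sys/prctl.h>

    #ifndef PR_TASK_PERF_COUNTERS_DISABLE
    # define PR_TASK_PERF_COUNTERS_DISABLE  31      /* values from the hunk above */
    # define PR_TASK_PERF_COUNTERS_ENABLE   32
    #endif

    static void measured_phase(void)   { /* hypothetical placeholder */ }
    static void unmeasured_phase(void) { /* hypothetical placeholder */ }

    int main(void)
    {
            measured_phase();

            /* Switch off all counters of the calling task ... */
            prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
            unmeasured_phase();
            /* ... and switch them back on afterwards. */
            prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
            return 0;
    }
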
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..f134a0f7080a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/fs_struct.h> 71#include <linux/fs_struct.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -1031,6 +1032,8 @@ struct sched_entity {
1031 u64 last_wakeup; 1032 u64 last_wakeup;
1032 u64 avg_overlap; 1033 u64 avg_overlap;
1033 1034
1035 u64 nr_migrations;
1036
1034#ifdef CONFIG_SCHEDSTATS 1037#ifdef CONFIG_SCHEDSTATS
1035 u64 wait_start; 1038 u64 wait_start;
1036 u64 wait_max; 1039 u64 wait_max;
@@ -1046,7 +1049,6 @@ struct sched_entity {
1046 u64 exec_max; 1049 u64 exec_max;
1047 u64 slice_max; 1050 u64 slice_max;
1048 1051
1049 u64 nr_migrations;
1050 u64 nr_migrations_cold; 1052 u64 nr_migrations_cold;
1051 u64 nr_failed_migrations_affine; 1053 u64 nr_failed_migrations_affine;
1052 u64 nr_failed_migrations_running; 1054 u64 nr_failed_migrations_running;
@@ -1349,6 +1351,7 @@ struct task_struct {
1349 struct list_head pi_state_list; 1351 struct list_head pi_state_list;
1350 struct futex_pi_state *pi_state_cache; 1352 struct futex_pi_state *pi_state_cache;
1351#endif 1353#endif
1354 struct perf_counter_context perf_counter_ctx;
1352#ifdef CONFIG_NUMA 1355#ifdef CONFIG_NUMA
1353 struct mempolicy *mempolicy; 1356 struct mempolicy *mempolicy;
1354 short il_next; 1357 short il_next;
@@ -2322,6 +2325,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2322#define TASK_SIZE_OF(tsk) TASK_SIZE 2325#define TASK_SIZE_OF(tsk) TASK_SIZE
2323#endif 2326#endif
2324 2327
2328/*
2329 * Call the function if the target task is executing on a CPU right now:
2330 */
2331extern void task_oncpu_function_call(struct task_struct *p,
2332 void (*func) (void *info), void *info);
2333
2334
2325#ifdef CONFIG_MM_OWNER 2335#ifdef CONFIG_MM_OWNER
2326extern void mm_update_next_owner(struct mm_struct *mm); 2336extern void mm_update_next_owner(struct mm_struct *mm);
2327extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2337extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 18d0a243a7b3..a1d177ce0a08 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
54struct compat_timeval; 54struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct perf_counter_hw_event;
57 58
58#include <linux/types.h> 59#include <linux/types.h>
59#include <linux/aio_abi.h> 60#include <linux/aio_abi.h>
@@ -624,4 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
624 625
625int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 626int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
626 627
628
629asmlinkage int sys_perf_counter_open(
630
631 struct perf_counter_hw_event *hw_event_uptr __user,
632 pid_t pid,
633 int cpu,
634 int group_fd);
627#endif 635#endif
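
For reference, a thin user-space wrapper sketch matching the prototype just
declared; the argument conventions are taken from the sys_perf_counter_open()
and find_get_context() code later in this patch. Assumed: __NR_perf_counter_open
from the patched unistd headers, and the hw_event structure from the new
<linux/perf_counter.h>.

    #include <sys/types.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct perf_counter_hw_event;   /* ABI struct declared in the new header */

    int perf_counter_open(struct perf_counter_hw_event *hw_event,
                          pid_t pid, int cpu, int group_fd)
    {
            /*
             * pid == 0:       count the calling task
             * pid  > 0:       count that task (ptrace-style permission check applies)
             * cpu == -1:      per-task counter that follows the task across CPUs
             * cpu >= 0:       counter bound to that CPU (requires CAP_SYS_ADMIN)
             * group_fd == -1: the counter becomes its own group leader;
             *                 otherwise it joins the group of that fd
             */
            return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
    }
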
diff --git a/init/Kconfig b/init/Kconfig
index a724a149bf3f..a588cdc274bc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -777,6 +777,36 @@ config AIO
777 by some high performance threaded applications. Disabling 777 by some high performance threaded applications. Disabling
778 this option saves about 7k. 778 this option saves about 7k.
779 779
780config HAVE_PERF_COUNTERS
781 bool
782
783menu "Performance Counters"
784
785config PERF_COUNTERS
786 bool "Kernel Performance Counters"
787 depends on HAVE_PERF_COUNTERS
788 default y
789 select ANON_INODES
790 help
791 Enable kernel support for performance counter hardware.
792
793 Performance counters are special hardware registers available
794 on most modern CPUs. These registers count the number of certain
795 types of hw events, such as instructions executed, cache misses
796 suffered, or branches mis-predicted - without slowing down the
797 kernel or applications. These registers can also trigger interrupts
798 when a threshold number of events have passed - and can thus be
799 used to profile the code that runs on that CPU.
800
801 The Linux Performance Counter subsystem provides an abstraction of
802 these hardware capabilities, available via a system call. It
803 provides per task and per CPU counters, and it provides event
804 capabilities on top of those.
805
806 Say Y if unsure.
807
808endmenu
809
780config VM_EVENT_COUNTERS 810config VM_EVENT_COUNTERS
781 default y 811 default y
782 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 812 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 2921d90ce32f..8b2628c7914b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
89obj-$(CONFIG_FUNCTION_TRACER) += trace/ 89obj-$(CONFIG_FUNCTION_TRACER) += trace/
90obj-$(CONFIG_TRACING) += trace/ 90obj-$(CONFIG_TRACING) += trace/
91obj-$(CONFIG_SMP) += sched_cpupri.o 91obj-$(CONFIG_SMP) += sched_cpupri.o
92obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
92 93
93ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 94ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 95# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index c7740fa3252c..cbdb39a498eb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -159,6 +159,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
159{ 159{
160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
161 161
162#ifdef CONFIG_PERF_COUNTERS
163 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
164#endif
162 trace_sched_process_free(tsk); 165 trace_sched_process_free(tsk);
163 put_task_struct(tsk); 166 put_task_struct(tsk);
164} 167}
@@ -1093,10 +1096,6 @@ NORET_TYPE void do_exit(long code)
1093 tsk->mempolicy = NULL; 1096 tsk->mempolicy = NULL;
1094#endif 1097#endif
1095#ifdef CONFIG_FUTEX 1098#ifdef CONFIG_FUTEX
1096 /*
1097 * This must happen late, after the PID is not
1098 * hashed anymore:
1099 */
1100 if (unlikely(!list_empty(&tsk->pi_state_list))) 1099 if (unlikely(!list_empty(&tsk->pi_state_list)))
1101 exit_pi_state_list(tsk); 1100 exit_pi_state_list(tsk);
1102 if (unlikely(current->pi_state_cache)) 1101 if (unlikely(current->pi_state_cache))
@@ -1361,6 +1360,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1361 */ 1360 */
1362 read_unlock(&tasklist_lock); 1361 read_unlock(&tasklist_lock);
1363 1362
1363 /*
1364 * Flush inherited counters to the parent - before the parent
1365 * gets woken up by child-exit notifications.
1366 */
1367 perf_counter_exit_task(p);
1368
1364 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1369 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1365 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1370 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1366 ? p->signal->group_exit_code : p->exit_code; 1371 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f1255dd8..b1f8609287eb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -985,6 +985,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
985 goto fork_out; 985 goto fork_out;
986 986
987 rt_mutex_init_task(p); 987 rt_mutex_init_task(p);
988 perf_counter_init_task(p);
988 989
989#ifdef CONFIG_PROVE_LOCKING 990#ifdef CONFIG_PROVE_LOCKING
990 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 991 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..37f771691f93
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,1686 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/file.h>
14#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
21#include <linux/kernel_stat.h>
22#include <linux/perf_counter.h>
23
24/*
25 * Each CPU has a list of per CPU counters:
26 */
27DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
28
29int perf_max_counters __read_mostly = 1;
30static int perf_reserved_percpu __read_mostly;
31static int perf_overcommit __read_mostly = 1;
32
33/*
34 * Mutex for (sysadmin-configurable) counter reservations:
35 */
36static DEFINE_MUTEX(perf_resource_mutex);
37
38/*
39 * Architecture provided APIs - weak aliases:
40 */
41extern __weak const struct hw_perf_counter_ops *
42hw_perf_counter_init(struct perf_counter *counter)
43{
44 return ERR_PTR(-EINVAL);
45}
46
47u64 __weak hw_perf_save_disable(void) { return 0; }
48void __weak hw_perf_restore(u64 ctrl) { barrier(); }
49void __weak hw_perf_counter_setup(void) { barrier(); }
50
51static void
52list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
53{
54 struct perf_counter *group_leader = counter->group_leader;
55
56 /*
57 * Depending on whether it is a standalone or sibling counter,
58 * add it straight to the context's counter list, or to the group
59 * leader's sibling list:
60 */
61 if (counter->group_leader == counter)
62 list_add_tail(&counter->list_entry, &ctx->counter_list);
63 else
64 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
65}
66
67static void
68list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
69{
70 struct perf_counter *sibling, *tmp;
71
72 list_del_init(&counter->list_entry);
73
74 /*
75 * If this was a group counter with sibling counters then
76 * upgrade the siblings to singleton counters by adding them
77 * to the context list directly:
78 */
79 list_for_each_entry_safe(sibling, tmp,
80 &counter->sibling_list, list_entry) {
81
82 list_del_init(&sibling->list_entry);
83 list_add_tail(&sibling->list_entry, &ctx->counter_list);
84 sibling->group_leader = sibling;
85 }
86}
87
88/*
89 * Cross CPU call to remove a performance counter
90 *
91 * We disable the counter on the hardware level first. After that we
92 * remove it from the context list.
93 */
94static void __perf_counter_remove_from_context(void *info)
95{
96 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
97 struct perf_counter *counter = info;
98 struct perf_counter_context *ctx = counter->ctx;
99 unsigned long flags;
100 u64 perf_flags;
101
102 /*
103 * If this is a task context, we need to check whether it is
104 * the current task context of this cpu. If not it has been
105 * scheduled out before the smp call arrived.
106 */
107 if (ctx->task && cpuctx->task_ctx != ctx)
108 return;
109
110 curr_rq_lock_irq_save(&flags);
111 spin_lock(&ctx->lock);
112
113 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
114 counter->state = PERF_COUNTER_STATE_INACTIVE;
115 counter->hw_ops->disable(counter);
116 ctx->nr_active--;
117 cpuctx->active_oncpu--;
118 counter->task = NULL;
119 counter->oncpu = -1;
120 }
121 ctx->nr_counters--;
122
123 /*
124 * Protect the list operation against NMI by disabling the
125 * counters on a global level. NOP for non NMI based counters.
126 */
127 perf_flags = hw_perf_save_disable();
128 list_del_counter(counter, ctx);
129 hw_perf_restore(perf_flags);
130
131 if (!ctx->task) {
132 /*
133 * Allow more per task counters with respect to the
134 * reservation:
135 */
136 cpuctx->max_pertask =
137 min(perf_max_counters - ctx->nr_counters,
138 perf_max_counters - perf_reserved_percpu);
139 }
140
141 spin_unlock(&ctx->lock);
142 curr_rq_unlock_irq_restore(&flags);
143}
144
145
146/*
147 * Remove the counter from a task's (or a CPU's) list of counters.
148 *
149 * Must be called with counter->mutex held.
150 *
151 * CPU counters are removed with a smp call. For task counters we only
152 * call when the task is on a CPU.
153 */
154static void perf_counter_remove_from_context(struct perf_counter *counter)
155{
156 struct perf_counter_context *ctx = counter->ctx;
157 struct task_struct *task = ctx->task;
158
159 if (!task) {
160 /*
161 * Per cpu counters are removed via an smp call and
162 * the removal is always successful.
163 */
164 smp_call_function_single(counter->cpu,
165 __perf_counter_remove_from_context,
166 counter, 1);
167 return;
168 }
169
170retry:
171 task_oncpu_function_call(task, __perf_counter_remove_from_context,
172 counter);
173
174 spin_lock_irq(&ctx->lock);
175 /*
176 * If the context is active we need to retry the smp call.
177 */
178 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
179 spin_unlock_irq(&ctx->lock);
180 goto retry;
181 }
182
183 /*
184 * The lock prevents this context from being scheduled in, so we
185 * can remove the counter safely if the call above did not
186 * succeed.
187 */
188 if (!list_empty(&counter->list_entry)) {
189 ctx->nr_counters--;
190 list_del_counter(counter, ctx);
191 counter->task = NULL;
192 }
193 spin_unlock_irq(&ctx->lock);
194}
195
196static int
197counter_sched_in(struct perf_counter *counter,
198 struct perf_cpu_context *cpuctx,
199 struct perf_counter_context *ctx,
200 int cpu)
201{
202 if (counter->state == PERF_COUNTER_STATE_OFF)
203 return 0;
204
205 counter->state = PERF_COUNTER_STATE_ACTIVE;
206 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
207 /*
208 * The new state must be visible before we turn it on in the hardware:
209 */
210 smp_wmb();
211
212 if (counter->hw_ops->enable(counter)) {
213 counter->state = PERF_COUNTER_STATE_INACTIVE;
214 counter->oncpu = -1;
215 return -EAGAIN;
216 }
217
218 cpuctx->active_oncpu++;
219 ctx->nr_active++;
220
221 return 0;
222}
223
224/*
225 * Cross CPU call to install and enable a performance counter
226 */
227static void __perf_install_in_context(void *info)
228{
229 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
230 struct perf_counter *counter = info;
231 struct perf_counter_context *ctx = counter->ctx;
232 int cpu = smp_processor_id();
233 unsigned long flags;
234 u64 perf_flags;
235
236 /*
237 * If this is a task context, we need to check whether it is
238 * the current task context of this cpu. If not it has been
239 * scheduled out before the smp call arrived.
240 */
241 if (ctx->task && cpuctx->task_ctx != ctx)
242 return;
243
244 curr_rq_lock_irq_save(&flags);
245 spin_lock(&ctx->lock);
246
247 /*
248 * Protect the list operation against NMI by disabling the
249 * counters on a global level. NOP for non NMI based counters.
250 */
251 perf_flags = hw_perf_save_disable();
252
253 list_add_counter(counter, ctx);
254 ctx->nr_counters++;
255
256 counter_sched_in(counter, cpuctx, ctx, cpu);
257
258 if (!ctx->task && cpuctx->max_pertask)
259 cpuctx->max_pertask--;
260
261 hw_perf_restore(perf_flags);
262
263 spin_unlock(&ctx->lock);
264 curr_rq_unlock_irq_restore(&flags);
265}
266
267/*
268 * Attach a performance counter to a context
269 *
270 * First we add the counter to the list with the hardware enable bit
271 * in counter->hw_config cleared.
272 *
273 * If the counter is attached to a task which is on a CPU we use a smp
274 * call to enable it in the task context. The task might have been
275 * scheduled away, but we check this in the smp call again.
276 */
277static void
278perf_install_in_context(struct perf_counter_context *ctx,
279 struct perf_counter *counter,
280 int cpu)
281{
282 struct task_struct *task = ctx->task;
283
284 counter->ctx = ctx;
285 if (!task) {
286 /*
287 * Per cpu counters are installed via an smp call and
288 * the install is always successful.
289 */
290 smp_call_function_single(cpu, __perf_install_in_context,
291 counter, 1);
292 return;
293 }
294
295 counter->task = task;
296retry:
297 task_oncpu_function_call(task, __perf_install_in_context,
298 counter);
299
300 spin_lock_irq(&ctx->lock);
301 /*
302 * If the context is active we need to retry the smp call.
303 */
304 if (ctx->nr_active && list_empty(&counter->list_entry)) {
305 spin_unlock_irq(&ctx->lock);
306 goto retry;
307 }
308
309 /*
310 * The lock prevents this context from being scheduled in, so we
311 * can add the counter safely if the call above did not
312 * succeed.
313 */
314 if (list_empty(&counter->list_entry)) {
315 list_add_counter(counter, ctx);
316 ctx->nr_counters++;
317 }
318 spin_unlock_irq(&ctx->lock);
319}
320
321static void
322counter_sched_out(struct perf_counter *counter,
323 struct perf_cpu_context *cpuctx,
324 struct perf_counter_context *ctx)
325{
326 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
327 return;
328
329 counter->state = PERF_COUNTER_STATE_INACTIVE;
330 counter->hw_ops->disable(counter);
331 counter->oncpu = -1;
332
333 cpuctx->active_oncpu--;
334 ctx->nr_active--;
335}
336
337static void
338group_sched_out(struct perf_counter *group_counter,
339 struct perf_cpu_context *cpuctx,
340 struct perf_counter_context *ctx)
341{
342 struct perf_counter *counter;
343
344 counter_sched_out(group_counter, cpuctx, ctx);
345
346 /*
347 * Schedule out siblings (if any):
348 */
349 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
350 counter_sched_out(counter, cpuctx, ctx);
351}
352
353void __perf_counter_sched_out(struct perf_counter_context *ctx,
354 struct perf_cpu_context *cpuctx)
355{
356 struct perf_counter *counter;
357
358 if (likely(!ctx->nr_counters))
359 return;
360
361 spin_lock(&ctx->lock);
362 if (ctx->nr_active) {
363 list_for_each_entry(counter, &ctx->counter_list, list_entry)
364 group_sched_out(counter, cpuctx, ctx);
365 }
366 spin_unlock(&ctx->lock);
367}
368
369/*
370 * Called from scheduler to remove the counters of the current task,
371 * with interrupts disabled.
372 *
373 * We stop each counter and update the counter value in counter->count.
374 *
375 * This does not protect us against NMI, but disable()
376 * sets the disabled bit in the control field of counter _before_
377 * accessing the counter control register. If a NMI hits, then it will
378 * not restart the counter.
379 */
380void perf_counter_task_sched_out(struct task_struct *task, int cpu)
381{
382 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
383 struct perf_counter_context *ctx = &task->perf_counter_ctx;
384
385 if (likely(!cpuctx->task_ctx))
386 return;
387
388 __perf_counter_sched_out(ctx, cpuctx);
389
390 cpuctx->task_ctx = NULL;
391}
392
393static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
394{
395 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
396}
397
398static int
399group_sched_in(struct perf_counter *group_counter,
400 struct perf_cpu_context *cpuctx,
401 struct perf_counter_context *ctx,
402 int cpu)
403{
404 struct perf_counter *counter, *partial_group;
405 int ret = 0;
406
407 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
408 return -EAGAIN;
409
410 /*
411 * Schedule in siblings as one group (if any):
412 */
413 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
414 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
415 partial_group = counter;
416 goto group_error;
417 }
418 ret = -EAGAIN;
419 }
420
421 return ret;
422
423group_error:
424 /*
425 * Groups can be scheduled in as one unit only, so undo any
426 * partial group before returning:
427 */
428 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
429 if (counter == partial_group)
430 break;
431 counter_sched_out(counter, cpuctx, ctx);
432 }
433 counter_sched_out(group_counter, cpuctx, ctx);
434
435 return -EAGAIN;
436}
437
438static void
439__perf_counter_sched_in(struct perf_counter_context *ctx,
440 struct perf_cpu_context *cpuctx, int cpu)
441{
442 struct perf_counter *counter;
443
444 if (likely(!ctx->nr_counters))
445 return;
446
447 spin_lock(&ctx->lock);
448 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
449 /*
450 * Listen to the 'cpu' scheduling filter constraint
451 * of counters:
452 */
453 if (counter->cpu != -1 && counter->cpu != cpu)
454 continue;
455
456 /*
457 * If we scheduled in a group atomically and
458 * exclusively, break out:
459 */
460 if (group_sched_in(counter, cpuctx, ctx, cpu))
461 break;
462 }
463 spin_unlock(&ctx->lock);
464}
465
466/*
467 * Called from scheduler to add the counters of the current task
468 * with interrupts disabled.
469 *
470 * We restore the counter value and then enable it.
471 *
472 * This does not protect us against NMI, but enable()
473 * sets the enabled bit in the control field of counter _before_
474 * accessing the counter control register. If a NMI hits, then it will
475 * keep the counter running.
476 */
477void perf_counter_task_sched_in(struct task_struct *task, int cpu)
478{
479 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
480 struct perf_counter_context *ctx = &task->perf_counter_ctx;
481
482 __perf_counter_sched_in(ctx, cpuctx, cpu);
483 cpuctx->task_ctx = ctx;
484}
485
486static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
487{
488 struct perf_counter_context *ctx = &cpuctx->ctx;
489
490 __perf_counter_sched_in(ctx, cpuctx, cpu);
491}
492
493int perf_counter_task_disable(void)
494{
495 struct task_struct *curr = current;
496 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
497 struct perf_counter *counter;
498 unsigned long flags;
499 u64 perf_flags;
500 int cpu;
501
502 if (likely(!ctx->nr_counters))
503 return 0;
504
505 curr_rq_lock_irq_save(&flags);
506 cpu = smp_processor_id();
507
508 /* force the update of the task clock: */
509 __task_delta_exec(curr, 1);
510
511 perf_counter_task_sched_out(curr, cpu);
512
513 spin_lock(&ctx->lock);
514
515 /*
516 * Disable all the counters:
517 */
518 perf_flags = hw_perf_save_disable();
519
520 list_for_each_entry(counter, &ctx->counter_list, list_entry)
521 counter->state = PERF_COUNTER_STATE_OFF;
522
523 hw_perf_restore(perf_flags);
524
525 spin_unlock(&ctx->lock);
526
527 curr_rq_unlock_irq_restore(&flags);
528
529 return 0;
530}
531
532int perf_counter_task_enable(void)
533{
534 struct task_struct *curr = current;
535 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
536 struct perf_counter *counter;
537 unsigned long flags;
538 u64 perf_flags;
539 int cpu;
540
541 if (likely(!ctx->nr_counters))
542 return 0;
543
544 curr_rq_lock_irq_save(&flags);
545 cpu = smp_processor_id();
546
547 /* force the update of the task clock: */
548 __task_delta_exec(curr, 1);
549
550 perf_counter_task_sched_out(curr, cpu);
551
552 spin_lock(&ctx->lock);
553
554 /*
555 * Disable all the counters:
556 */
557 perf_flags = hw_perf_save_disable();
558
559 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
560 if (counter->state != PERF_COUNTER_STATE_OFF)
561 continue;
562 counter->state = PERF_COUNTER_STATE_INACTIVE;
563 counter->hw_event.disabled = 0;
564 }
565 hw_perf_restore(perf_flags);
566
567 spin_unlock(&ctx->lock);
568
569 perf_counter_task_sched_in(curr, cpu);
570
571 curr_rq_unlock_irq_restore(&flags);
572
573 return 0;
574}
575
576/*
577 * Round-robin a context's counters:
578 */
579static void rotate_ctx(struct perf_counter_context *ctx)
580{
581 struct perf_counter *counter;
582 u64 perf_flags;
583
584 if (!ctx->nr_counters)
585 return;
586
587 spin_lock(&ctx->lock);
588 /*
589 * Rotate the first entry last (works just fine for group counters too):
590 */
591 perf_flags = hw_perf_save_disable();
592 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
593 list_del(&counter->list_entry);
594 list_add_tail(&counter->list_entry, &ctx->counter_list);
595 break;
596 }
597 hw_perf_restore(perf_flags);
598
599 spin_unlock(&ctx->lock);
600}
601
602void perf_counter_task_tick(struct task_struct *curr, int cpu)
603{
604 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
605 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
606 const int rotate_percpu = 0;
607
608 if (rotate_percpu)
609 perf_counter_cpu_sched_out(cpuctx);
610 perf_counter_task_sched_out(curr, cpu);
611
612 if (rotate_percpu)
613 rotate_ctx(&cpuctx->ctx);
614 rotate_ctx(ctx);
615
616 if (rotate_percpu)
617 perf_counter_cpu_sched_in(cpuctx, cpu);
618 perf_counter_task_sched_in(curr, cpu);
619}
620
621/*
622 * Cross CPU call to read the hardware counter
623 */
624static void __read(void *info)
625{
626 struct perf_counter *counter = info;
627 unsigned long flags;
628
629 curr_rq_lock_irq_save(&flags);
630 counter->hw_ops->read(counter);
631 curr_rq_unlock_irq_restore(&flags);
632}
633
634static u64 perf_counter_read(struct perf_counter *counter)
635{
636 /*
637 * If counter is enabled and currently active on a CPU, update the
638 * value in the counter structure:
639 */
640 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
641 smp_call_function_single(counter->oncpu,
642 __read, counter, 1);
643 }
644
645 return atomic64_read(&counter->count);
646}
647
648/*
649 * Cross CPU call to switch performance data pointers
650 */
651static void __perf_switch_irq_data(void *info)
652{
653 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
654 struct perf_counter *counter = info;
655 struct perf_counter_context *ctx = counter->ctx;
656 struct perf_data *oldirqdata = counter->irqdata;
657
658 /*
659 * If this is a task context, we need to check whether it is
660 * the current task context of this cpu. If not it has been
661 * scheduled out before the smp call arrived.
662 */
663 if (ctx->task) {
664 if (cpuctx->task_ctx != ctx)
665 return;
666 spin_lock(&ctx->lock);
667 }
668
669 /* Change the pointer NMI safe */
670 atomic_long_set((atomic_long_t *)&counter->irqdata,
671 (unsigned long) counter->usrdata);
672 counter->usrdata = oldirqdata;
673
674 if (ctx->task)
675 spin_unlock(&ctx->lock);
676}
677
678static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
679{
680 struct perf_counter_context *ctx = counter->ctx;
681 struct perf_data *oldirqdata = counter->irqdata;
682 struct task_struct *task = ctx->task;
683
684 if (!task) {
685 smp_call_function_single(counter->cpu,
686 __perf_switch_irq_data,
687 counter, 1);
688 return counter->usrdata;
689 }
690
691retry:
692 spin_lock_irq(&ctx->lock);
693 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
694 counter->irqdata = counter->usrdata;
695 counter->usrdata = oldirqdata;
696 spin_unlock_irq(&ctx->lock);
697 return oldirqdata;
698 }
699 spin_unlock_irq(&ctx->lock);
700 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
701 /* Might have failed, because task was scheduled out */
702 if (counter->irqdata == oldirqdata)
703 goto retry;
704
705 return counter->usrdata;
706}
707
708static void put_context(struct perf_counter_context *ctx)
709{
710 if (ctx->task)
711 put_task_struct(ctx->task);
712}
713
714static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
715{
716 struct perf_cpu_context *cpuctx;
717 struct perf_counter_context *ctx;
718 struct task_struct *task;
719
720 /*
721 * If cpu is not a wildcard then this is a percpu counter:
722 */
723 if (cpu != -1) {
724 /* Must be root to operate on a CPU counter: */
725 if (!capable(CAP_SYS_ADMIN))
726 return ERR_PTR(-EACCES);
727
728 if (cpu < 0 || cpu > num_possible_cpus())
729 return ERR_PTR(-EINVAL);
730
731 /*
732 * We could be clever and allow to attach a counter to an
733 * offline CPU and activate it when the CPU comes up, but
734 * that's for later.
735 */
736 if (!cpu_isset(cpu, cpu_online_map))
737 return ERR_PTR(-ENODEV);
738
739 cpuctx = &per_cpu(perf_cpu_context, cpu);
740 ctx = &cpuctx->ctx;
741
742 return ctx;
743 }
744
745 rcu_read_lock();
746 if (!pid)
747 task = current;
748 else
749 task = find_task_by_vpid(pid);
750 if (task)
751 get_task_struct(task);
752 rcu_read_unlock();
753
754 if (!task)
755 return ERR_PTR(-ESRCH);
756
757 ctx = &task->perf_counter_ctx;
758 ctx->task = task;
759
760 /* Reuse ptrace permission checks for now. */
761 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
762 put_context(ctx);
763 return ERR_PTR(-EACCES);
764 }
765
766 return ctx;
767}
768
769/*
770 * Called when the last reference to the file is gone.
771 */
772static int perf_release(struct inode *inode, struct file *file)
773{
774 struct perf_counter *counter = file->private_data;
775 struct perf_counter_context *ctx = counter->ctx;
776
777 file->private_data = NULL;
778
779 mutex_lock(&counter->mutex);
780
781 perf_counter_remove_from_context(counter);
782 put_context(ctx);
783
784 mutex_unlock(&counter->mutex);
785
786 kfree(counter);
787
788 return 0;
789}
790
791/*
792 * Read the performance counter - simple non blocking version for now
793 */
794static ssize_t
795perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
796{
797 u64 cntval;
798
799 if (count != sizeof(cntval))
800 return -EINVAL;
801
802 mutex_lock(&counter->mutex);
803 cntval = perf_counter_read(counter);
804 mutex_unlock(&counter->mutex);
805
806 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
807}
808
809static ssize_t
810perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
811{
812 if (!usrdata->len)
813 return 0;
814
815 count = min(count, (size_t)usrdata->len);
816 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
817 return -EFAULT;
818
819 /* Adjust the counters */
820 usrdata->len -= count;
821 if (!usrdata->len)
822 usrdata->rd_idx = 0;
823 else
824 usrdata->rd_idx += count;
825
826 return count;
827}
828
829static ssize_t
830perf_read_irq_data(struct perf_counter *counter,
831 char __user *buf,
832 size_t count,
833 int nonblocking)
834{
835 struct perf_data *irqdata, *usrdata;
836 DECLARE_WAITQUEUE(wait, current);
837 ssize_t res;
838
839 irqdata = counter->irqdata;
840 usrdata = counter->usrdata;
841
842 if (usrdata->len + irqdata->len >= count)
843 goto read_pending;
844
845 if (nonblocking)
846 return -EAGAIN;
847
848 spin_lock_irq(&counter->waitq.lock);
849 __add_wait_queue(&counter->waitq, &wait);
850 for (;;) {
851 set_current_state(TASK_INTERRUPTIBLE);
852 if (usrdata->len + irqdata->len >= count)
853 break;
854
855 if (signal_pending(current))
856 break;
857
858 spin_unlock_irq(&counter->waitq.lock);
859 schedule();
860 spin_lock_irq(&counter->waitq.lock);
861 }
862 __remove_wait_queue(&counter->waitq, &wait);
863 __set_current_state(TASK_RUNNING);
864 spin_unlock_irq(&counter->waitq.lock);
865
866 if (usrdata->len + irqdata->len < count)
867 return -ERESTARTSYS;
868read_pending:
869 mutex_lock(&counter->mutex);
870
871 /* Drain pending data first: */
872 res = perf_copy_usrdata(usrdata, buf, count);
873 if (res < 0 || res == count)
874 goto out;
875
876 /* Switch irq buffer: */
877 usrdata = perf_switch_irq_data(counter);
878 if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
879 if (!res)
880 res = -EFAULT;
881 } else {
882 res = count;
883 }
884out:
885 mutex_unlock(&counter->mutex);
886
887 return res;
888}
889
890static ssize_t
891perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
892{
893 struct perf_counter *counter = file->private_data;
894
895 switch (counter->hw_event.record_type) {
896 case PERF_RECORD_SIMPLE:
897 return perf_read_hw(counter, buf, count);
898
899 case PERF_RECORD_IRQ:
900 case PERF_RECORD_GROUP:
901 return perf_read_irq_data(counter, buf, count,
902 file->f_flags & O_NONBLOCK);
903 }
904 return -EINVAL;
905}
906
907static unsigned int perf_poll(struct file *file, poll_table *wait)
908{
909 struct perf_counter *counter = file->private_data;
910 unsigned int events = 0;
911 unsigned long flags;
912
913 poll_wait(file, &counter->waitq, wait);
914
915 spin_lock_irqsave(&counter->waitq.lock, flags);
916 if (counter->usrdata->len || counter->irqdata->len)
917 events |= POLLIN;
918 spin_unlock_irqrestore(&counter->waitq.lock, flags);
919
920 return events;
921}
922
923static const struct file_operations perf_fops = {
924 .release = perf_release,
925 .read = perf_read,
926 .poll = perf_poll,
927};
928
929static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
930{
931 return 0;
932}
933
934static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
935{
936}
937
938static void cpu_clock_perf_counter_read(struct perf_counter *counter)
939{
940 int cpu = raw_smp_processor_id();
941
942 atomic64_set(&counter->count, cpu_clock(cpu));
943}
944
945static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
946 .enable = cpu_clock_perf_counter_enable,
947 .disable = cpu_clock_perf_counter_disable,
948 .read = cpu_clock_perf_counter_read,
949};
950
951/*
952 * Called from within the scheduler:
953 */
954static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
955{
956 struct task_struct *curr = counter->task;
957 u64 delta;
958
959 delta = __task_delta_exec(curr, update);
960
961 return curr->se.sum_exec_runtime + delta;
962}
963
964static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
965{
966 u64 prev;
967 s64 delta;
968
969 prev = atomic64_read(&counter->hw.prev_count);
970
971 atomic64_set(&counter->hw.prev_count, now);
972
973 delta = now - prev;
974
975 atomic64_add(delta, &counter->count);
976}
977
978static void task_clock_perf_counter_read(struct perf_counter *counter)
979{
980 u64 now = task_clock_perf_counter_val(counter, 1);
981
982 task_clock_perf_counter_update(counter, now);
983}
984
985static int task_clock_perf_counter_enable(struct perf_counter *counter)
986{
987 u64 now = task_clock_perf_counter_val(counter, 0);
988
989 atomic64_set(&counter->hw.prev_count, now);
990
991 return 0;
992}
993
994static void task_clock_perf_counter_disable(struct perf_counter *counter)
995{
996 u64 now = task_clock_perf_counter_val(counter, 0);
997
998 task_clock_perf_counter_update(counter, now);
999}
1000
1001static const struct hw_perf_counter_ops perf_ops_task_clock = {
1002 .enable = task_clock_perf_counter_enable,
1003 .disable = task_clock_perf_counter_disable,
1004 .read = task_clock_perf_counter_read,
1005};
1006
1007static u64 get_page_faults(void)
1008{
1009 struct task_struct *curr = current;
1010
1011 return curr->maj_flt + curr->min_flt;
1012}
1013
1014static void page_faults_perf_counter_update(struct perf_counter *counter)
1015{
1016 u64 prev, now;
1017 s64 delta;
1018
1019 prev = atomic64_read(&counter->hw.prev_count);
1020 now = get_page_faults();
1021
1022 atomic64_set(&counter->hw.prev_count, now);
1023
1024 delta = now - prev;
1025
1026 atomic64_add(delta, &counter->count);
1027}
1028
1029static void page_faults_perf_counter_read(struct perf_counter *counter)
1030{
1031 page_faults_perf_counter_update(counter);
1032}
1033
1034static int page_faults_perf_counter_enable(struct perf_counter *counter)
1035{
1036 /*
1037 * page-faults is a per-task value already,
1038 * so we don't have to clear it on switch-in.
1039 */
1040
1041 return 0;
1042}
1043
1044static void page_faults_perf_counter_disable(struct perf_counter *counter)
1045{
1046 page_faults_perf_counter_update(counter);
1047}
1048
1049static const struct hw_perf_counter_ops perf_ops_page_faults = {
1050 .enable = page_faults_perf_counter_enable,
1051 .disable = page_faults_perf_counter_disable,
1052 .read = page_faults_perf_counter_read,
1053};
1054
1055static u64 get_context_switches(void)
1056{
1057 struct task_struct *curr = current;
1058
1059 return curr->nvcsw + curr->nivcsw;
1060}
1061
1062static void context_switches_perf_counter_update(struct perf_counter *counter)
1063{
1064 u64 prev, now;
1065 s64 delta;
1066
1067 prev = atomic64_read(&counter->hw.prev_count);
1068 now = get_context_switches();
1069
1070 atomic64_set(&counter->hw.prev_count, now);
1071
1072 delta = now - prev;
1073
1074 atomic64_add(delta, &counter->count);
1075}
1076
1077static void context_switches_perf_counter_read(struct perf_counter *counter)
1078{
1079 context_switches_perf_counter_update(counter);
1080}
1081
1082static int context_switches_perf_counter_enable(struct perf_counter *counter)
1083{
1084 /*
1085 * curr->nvcsw + curr->nivcsw is a per-task value already,
1086 * so we don't have to clear it on switch-in.
1087 */
1088
1089 return 0;
1090}
1091
1092static void context_switches_perf_counter_disable(struct perf_counter *counter)
1093{
1094 context_switches_perf_counter_update(counter);
1095}
1096
1097static const struct hw_perf_counter_ops perf_ops_context_switches = {
1098 .enable = context_switches_perf_counter_enable,
1099 .disable = context_switches_perf_counter_disable,
1100 .read = context_switches_perf_counter_read,
1101};
1102
1103static inline u64 get_cpu_migrations(void)
1104{
1105 return current->se.nr_migrations;
1106}
1107
1108static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1109{
1110 u64 prev, now;
1111 s64 delta;
1112
1113 prev = atomic64_read(&counter->hw.prev_count);
1114 now = get_cpu_migrations();
1115
1116 atomic64_set(&counter->hw.prev_count, now);
1117
1118 delta = now - prev;
1119
1120 atomic64_add(delta, &counter->count);
1121}
1122
1123static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1124{
1125 cpu_migrations_perf_counter_update(counter);
1126}
1127
1128static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1129{
1130 /*
1131 * se.nr_migrations is a per-task value already,
1132 * so we don't have to clear it on switch-in.
1133 */
1134
1135 return 0;
1136}
1137
1138static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1139{
1140 cpu_migrations_perf_counter_update(counter);
1141}
1142
1143static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1144 .enable = cpu_migrations_perf_counter_enable,
1145 .disable = cpu_migrations_perf_counter_disable,
1146 .read = cpu_migrations_perf_counter_read,
1147};
1148
1149static const struct hw_perf_counter_ops *
1150sw_perf_counter_init(struct perf_counter *counter)
1151{
1152 const struct hw_perf_counter_ops *hw_ops = NULL;
1153
1154 switch (counter->hw_event.type) {
1155 case PERF_COUNT_CPU_CLOCK:
1156 hw_ops = &perf_ops_cpu_clock;
1157 break;
1158 case PERF_COUNT_TASK_CLOCK:
1159 hw_ops = &perf_ops_task_clock;
1160 break;
1161 case PERF_COUNT_PAGE_FAULTS:
1162 hw_ops = &perf_ops_page_faults;
1163 break;
1164 case PERF_COUNT_CONTEXT_SWITCHES:
1165 hw_ops = &perf_ops_context_switches;
1166 break;
1167 case PERF_COUNT_CPU_MIGRATIONS:
1168 hw_ops = &perf_ops_cpu_migrations;
1169 break;
1170 default:
1171 break;
1172 }
1173 return hw_ops;
1174}
1175
1176/*
1177 * Allocate and initialize a counter structure
1178 */
1179static struct perf_counter *
1180perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1181 int cpu,
1182 struct perf_counter *group_leader,
1183 gfp_t gfpflags)
1184{
1185 const struct hw_perf_counter_ops *hw_ops;
1186 struct perf_counter *counter;
1187
1188 counter = kzalloc(sizeof(*counter), gfpflags);
1189 if (!counter)
1190 return NULL;
1191
1192 /*
1193 * Single counters are their own group leaders, with an
1194 * empty sibling list:
1195 */
1196 if (!group_leader)
1197 group_leader = counter;
1198
1199 mutex_init(&counter->mutex);
1200 INIT_LIST_HEAD(&counter->list_entry);
1201 INIT_LIST_HEAD(&counter->sibling_list);
1202 init_waitqueue_head(&counter->waitq);
1203
1204 counter->irqdata = &counter->data[0];
1205 counter->usrdata = &counter->data[1];
1206 counter->cpu = cpu;
1207 counter->hw_event = *hw_event;
1208 counter->wakeup_pending = 0;
1209 counter->group_leader = group_leader;
1210 counter->hw_ops = NULL;
1211
1212 counter->state = PERF_COUNTER_STATE_INACTIVE;
1213 if (hw_event->disabled)
1214 counter->state = PERF_COUNTER_STATE_OFF;
1215
1216 hw_ops = NULL;
1217 if (!hw_event->raw && hw_event->type < 0)
1218 hw_ops = sw_perf_counter_init(counter);
1219 if (!hw_ops)
1220 hw_ops = hw_perf_counter_init(counter);
1221
1222 if (!hw_ops) {
1223 kfree(counter);
1224 return NULL;
1225 }
1226 counter->hw_ops = hw_ops;
1227
1228 return counter;
1229}
1230
1231/**
1232 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1233 *
1234 * @hw_event_uptr: event type attributes for monitoring/sampling
1235 * @pid: target pid
1236 * @cpu: target cpu
1237 * @group_fd: group leader counter fd
1238 */
1239asmlinkage int
1240sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1241 pid_t pid, int cpu, int group_fd)
1242{
1243 struct perf_counter *counter, *group_leader;
1244 struct perf_counter_hw_event hw_event;
1245 struct perf_counter_context *ctx;
1246 struct file *counter_file = NULL;
1247 struct file *group_file = NULL;
1248 int fput_needed = 0;
1249 int fput_needed2 = 0;
1250 int ret;
1251
1252 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1253 return -EFAULT;
1254
1255 /*
1256 * Get the target context (task or percpu):
1257 */
1258 ctx = find_get_context(pid, cpu);
1259 if (IS_ERR(ctx))
1260 return PTR_ERR(ctx);
1261
1262 /*
1263 * Look up the group leader (we will attach this counter to it):
1264 */
1265 group_leader = NULL;
1266 if (group_fd != -1) {
1267 ret = -EINVAL;
1268 group_file = fget_light(group_fd, &fput_needed);
1269 if (!group_file)
1270 goto err_put_context;
1271 if (group_file->f_op != &perf_fops)
1272 goto err_put_context;
1273
1274 group_leader = group_file->private_data;
1275 /*
1276 * Do not allow a recursive hierarchy (this new sibling
1277 * becoming part of another group-sibling):
1278 */
1279 if (group_leader->group_leader != group_leader)
1280 goto err_put_context;
1281 /*
1282 * Do not allow to attach to a group in a different
1283 * task or CPU context:
1284 */
1285 if (group_leader->ctx != ctx)
1286 goto err_put_context;
1287 }
1288
1289 ret = -EINVAL;
1290 counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1291 if (!counter)
1292 goto err_put_context;
1293
1294 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1295 if (ret < 0)
1296 goto err_free_put_context;
1297
1298 counter_file = fget_light(ret, &fput_needed2);
1299 if (!counter_file)
1300 goto err_free_put_context;
1301
1302 counter->filp = counter_file;
1303 perf_install_in_context(ctx, counter, cpu);
1304
1305 fput_light(counter_file, fput_needed2);
1306
1307out_fput:
1308 fput_light(group_file, fput_needed);
1309
1310 return ret;
1311
1312err_free_put_context:
1313 kfree(counter);
1314
1315err_put_context:
1316 put_context(ctx);
1317
1318 goto out_fput;
1319}
1320
1321/*
1322 * Initialize the perf_counter context in a task_struct:
1323 */
1324static void
1325__perf_counter_init_context(struct perf_counter_context *ctx,
1326 struct task_struct *task)
1327{
1328 memset(ctx, 0, sizeof(*ctx));
1329 spin_lock_init(&ctx->lock);
1330 INIT_LIST_HEAD(&ctx->counter_list);
1331 ctx->task = task;
1332}
1333
1334/*
1335 * inherit a counter from parent task to child task:
1336 */
1337static int
1338inherit_counter(struct perf_counter *parent_counter,
1339 struct task_struct *parent,
1340 struct perf_counter_context *parent_ctx,
1341 struct task_struct *child,
1342 struct perf_counter_context *child_ctx)
1343{
1344 struct perf_counter *child_counter;
1345
1346 child_counter = perf_counter_alloc(&parent_counter->hw_event,
1347 parent_counter->cpu, NULL,
1348 GFP_ATOMIC);
1349 if (!child_counter)
1350 return -ENOMEM;
1351
1352 /*
1353 * Link it up in the child's context:
1354 */
1355 child_counter->ctx = child_ctx;
1356 child_counter->task = child;
1357 list_add_counter(child_counter, child_ctx);
1358 child_ctx->nr_counters++;
1359
1360 child_counter->parent = parent_counter;
1361 /*
1362 * inherit into child's child as well:
1363 */
1364 child_counter->hw_event.inherit = 1;
1365
1366 /*
1367 * Get a reference to the parent filp - we will fput it
1368 * when the child counter exits. This is safe to do because
1369 * we are in the parent and we know that the filp still
1370 * exists and has a nonzero count:
1371 */
1372 atomic_long_inc(&parent_counter->filp->f_count);
1373
1374 return 0;
1375}
1376
1377static void
1378__perf_counter_exit_task(struct task_struct *child,
1379 struct perf_counter *child_counter,
1380 struct perf_counter_context *child_ctx)
1381{
1382 struct perf_counter *parent_counter;
1383 u64 parent_val, child_val;
1384
1385 /*
1386 * If we do not self-reap then we have to wait for the
1387 * child task to unschedule (it will happen for sure),
1388 * so that its counter is at its final count. (This
1389 * condition triggers rarely - child tasks usually get
1390 * off their CPU before the parent has a chance to
1391 * get this far into the reaping action)
1392 */
1393 if (child != current) {
1394 wait_task_inactive(child, 0);
1395 list_del_init(&child_counter->list_entry);
1396 } else {
1397 struct perf_cpu_context *cpuctx;
1398 unsigned long flags;
1399 u64 perf_flags;
1400
1401 /*
1402 * Disable and unlink this counter.
1403 *
1404 * Be careful about zapping the list - IRQ/NMI context
1405 * could still be processing it:
1406 */
1407 curr_rq_lock_irq_save(&flags);
1408 perf_flags = hw_perf_save_disable();
1409
1410 cpuctx = &__get_cpu_var(perf_cpu_context);
1411
1412 if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
1413 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1414 child_counter->hw_ops->disable(child_counter);
1415 cpuctx->active_oncpu--;
1416 child_ctx->nr_active--;
1417 child_counter->oncpu = -1;
1418 }
1419
1420 list_del_init(&child_counter->list_entry);
1421
1422 child_ctx->nr_counters--;
1423
1424 hw_perf_restore(perf_flags);
1425 curr_rq_unlock_irq_restore(&flags);
1426 }
1427
1428 parent_counter = child_counter->parent;
1429 /*
1430 * It can happen that parent exits first, and has counters
1431 * that are still around due to the child reference. These
1432 * counters need to be zapped - but otherwise linger.
1433 */
1434 if (!parent_counter)
1435 return;
1436
1437 parent_val = atomic64_read(&parent_counter->count);
1438 child_val = atomic64_read(&child_counter->count);
1439
1440 /*
1441 * Add back the child's count to the parent's count:
1442 */
1443 atomic64_add(child_val, &parent_counter->count);
1444
1445 fput(parent_counter->filp);
1446
1447 kfree(child_counter);
1448}
1449
1450/*
1451 * When a child task exits, feed back counter values to parent counters.
1452 *
1453 * Note: we are running in child context, but the PID is not hashed
1454 * anymore so new counters will not be added.
1455 */
1456void perf_counter_exit_task(struct task_struct *child)
1457{
1458 struct perf_counter *child_counter, *tmp;
1459 struct perf_counter_context *child_ctx;
1460
1461 child_ctx = &child->perf_counter_ctx;
1462
1463 if (likely(!child_ctx->nr_counters))
1464 return;
1465
1466 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1467 list_entry)
1468 __perf_counter_exit_task(child, child_counter, child_ctx);
1469}
1470
1471/*
1472 * Initialize the perf_counter context in task_struct
1473 */
1474void perf_counter_init_task(struct task_struct *child)
1475{
1476 struct perf_counter_context *child_ctx, *parent_ctx;
1477 struct perf_counter *counter, *parent_counter;
1478 struct task_struct *parent = current;
1479 unsigned long flags;
1480
1481 child_ctx = &child->perf_counter_ctx;
1482 parent_ctx = &parent->perf_counter_ctx;
1483
1484 __perf_counter_init_context(child_ctx, child);
1485
1486 /*
1487 * This is executed from the parent task context, so inherit
1488 * counters that have been marked for cloning:
1489 */
1490
1491 if (likely(!parent_ctx->nr_counters))
1492 return;
1493
1494 /*
1495 * Lock the parent list. No need to lock the child - not PID
1496 * hashed yet and not running, so nobody can access it.
1497 */
1498 spin_lock_irqsave(&parent_ctx->lock, flags);
1499
1500 /*
1501	 * We don't have to disable NMIs - we are only looking at
1502 * the list, not manipulating it:
1503 */
1504 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1505 if (!counter->hw_event.inherit || counter->group_leader != counter)
1506 continue;
1507
1508 /*
1509 * Instead of creating recursive hierarchies of counters,
1510	 * we link inherited counters back to the original parent,
1511 * which has a filp for sure, which we use as the reference
1512 * count:
1513 */
1514 parent_counter = counter;
1515 if (counter->parent)
1516 parent_counter = counter->parent;
1517
1518 if (inherit_counter(parent_counter, parent,
1519 parent_ctx, child, child_ctx))
1520 break;
1521 }
1522
1523 spin_unlock_irqrestore(&parent_ctx->lock, flags);
1524}
1525
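To make the inheritance path above concrete, here is a hedged user-space sketch: a counter opened with hw_event.inherit set is cloned into the child by perf_counter_init_task() at fork time, and when the child exits its count is folded back into the original counter by __perf_counter_exit_task(), so a single read() in the parent covers parent and child together. It reuses the hypothetical perf_counter_open() wrapper from the earlier sketch; pid 0 meaning "current task" and the read() of a single 64-bit value are assumptions of this example.

#include <string.h>
#include <unistd.h>
#include <sys/wait.h>

static long long count_self_and_child(void)
{
	struct perf_counter_hw_event hw_event;
	long long count = -1;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.inherit = 1;			/* clone into children at fork() */

	fd = perf_counter_open(&hw_event, 0, -1, -1);	/* pid 0: current task (assumed) */
	if (fd < 0)
		return -1;

	if (fork() == 0) {
		/* hypothetical child workload */
		for (volatile int i = 0; i < 1000000; i++)
			;
		_exit(0);			/* child's count is added back to fd's counter */
	}
	wait(NULL);

	if (read(fd, &count, sizeof(count)) != sizeof(count))
		count = -1;			/* parent + exited child */
	close(fd);
	return count;
}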
1526static void __cpuinit perf_counter_init_cpu(int cpu)
1527{
1528 struct perf_cpu_context *cpuctx;
1529
1530 cpuctx = &per_cpu(perf_cpu_context, cpu);
1531 __perf_counter_init_context(&cpuctx->ctx, NULL);
1532
1533 mutex_lock(&perf_resource_mutex);
1534 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
1535 mutex_unlock(&perf_resource_mutex);
1536
1537 hw_perf_counter_setup();
1538}
1539
1540#ifdef CONFIG_HOTPLUG_CPU
1541static void __perf_counter_exit_cpu(void *info)
1542{
1543 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1544 struct perf_counter_context *ctx = &cpuctx->ctx;
1545 struct perf_counter *counter, *tmp;
1546
1547 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
1548 __perf_counter_remove_from_context(counter);
1549
1550}
1551static void perf_counter_exit_cpu(int cpu)
1552{
1553 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
1554}
1555#else
1556static inline void perf_counter_exit_cpu(int cpu) { }
1557#endif
1558
1559static int __cpuinit
1560perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
1561{
1562 unsigned int cpu = (long)hcpu;
1563
1564 switch (action) {
1565
1566 case CPU_UP_PREPARE:
1567 case CPU_UP_PREPARE_FROZEN:
1568 perf_counter_init_cpu(cpu);
1569 break;
1570
1571 case CPU_DOWN_PREPARE:
1572 case CPU_DOWN_PREPARE_FROZEN:
1573 perf_counter_exit_cpu(cpu);
1574 break;
1575
1576 default:
1577 break;
1578 }
1579
1580 return NOTIFY_OK;
1581}
1582
1583static struct notifier_block __cpuinitdata perf_cpu_nb = {
1584 .notifier_call = perf_cpu_notify,
1585};
1586
1587static int __init perf_counter_init(void)
1588{
1589 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
1590 (void *)(long)smp_processor_id());
1591 register_cpu_notifier(&perf_cpu_nb);
1592
1593 return 0;
1594}
1595early_initcall(perf_counter_init);
1596
1597static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
1598{
1599 return sprintf(buf, "%d\n", perf_reserved_percpu);
1600}
1601
1602static ssize_t
1603perf_set_reserve_percpu(struct sysdev_class *class,
1604 const char *buf,
1605 size_t count)
1606{
1607 struct perf_cpu_context *cpuctx;
1608 unsigned long val;
1609 int err, cpu, mpt;
1610
1611 err = strict_strtoul(buf, 10, &val);
1612 if (err)
1613 return err;
1614 if (val > perf_max_counters)
1615 return -EINVAL;
1616
1617 mutex_lock(&perf_resource_mutex);
1618 perf_reserved_percpu = val;
1619 for_each_online_cpu(cpu) {
1620 cpuctx = &per_cpu(perf_cpu_context, cpu);
1621 spin_lock_irq(&cpuctx->ctx.lock);
1622 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
1623 perf_max_counters - perf_reserved_percpu);
1624 cpuctx->max_pertask = mpt;
1625 spin_unlock_irq(&cpuctx->ctx.lock);
1626 }
1627 mutex_unlock(&perf_resource_mutex);
1628
1629 return count;
1630}
1631
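A worked instance of the max_pertask recomputation above, assuming a hypothetical PMU with perf_max_counters = 8 and a CPU whose context already holds 3 per-CPU counters, after writing 2 to reserve_percpu:

	mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
		  perf_max_counters - perf_reserved_percpu)
	    = min(8 - 3, 8 - 2) = 5

so roughly five per-task counters remain schedulable on that CPU.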
1632static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
1633{
1634 return sprintf(buf, "%d\n", perf_overcommit);
1635}
1636
1637static ssize_t
1638perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
1639{
1640 unsigned long val;
1641 int err;
1642
1643 err = strict_strtoul(buf, 10, &val);
1644 if (err)
1645 return err;
1646 if (val > 1)
1647 return -EINVAL;
1648
1649 mutex_lock(&perf_resource_mutex);
1650 perf_overcommit = val;
1651 mutex_unlock(&perf_resource_mutex);
1652
1653 return count;
1654}
1655
1656static SYSDEV_CLASS_ATTR(
1657 reserve_percpu,
1658 0644,
1659 perf_show_reserve_percpu,
1660 perf_set_reserve_percpu
1661 );
1662
1663static SYSDEV_CLASS_ATTR(
1664 overcommit,
1665 0644,
1666 perf_show_overcommit,
1667 perf_set_overcommit
1668 );
1669
1670static struct attribute *perfclass_attrs[] = {
1671 &attr_reserve_percpu.attr,
1672 &attr_overcommit.attr,
1673 NULL
1674};
1675
1676static struct attribute_group perfclass_attr_group = {
1677 .attrs = perfclass_attrs,
1678 .name = "perf_counters",
1679};
1680
1681static int __init perf_counter_sysfs_init(void)
1682{
1683 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
1684 &perfclass_attr_group);
1685}
1686device_initcall(perf_counter_sysfs_init);
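The two attributes registered above appear as writable files in the "perf_counters" group of the cpu sysdev class; assuming the conventional sysfs layout that would be /sys/devices/system/cpu/perf_counters/{reserve_percpu,overcommit} (the path is an assumption, not stated in the patch). A minimal user-space sketch of tuning them:

#include <stdio.h>

/* path assumed from the "perf_counters" group on the cpu sysdev class */
static int write_perf_knob(const char *name, unsigned long val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/perf_counters/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lu\n", val);
	return fclose(f);
}

/*
 * e.g. write_perf_knob("reserve_percpu", 2);  rejected if > perf_max_counters
 *      write_perf_knob("overcommit", 1);      only 0 or 1 is accepted
 */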
diff --git a/kernel/sched.c b/kernel/sched.c
index deb5ac8c12f3..43fd21233b93 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -665,7 +665,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -976,6 +976,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	local_irq_save(*flags);
+	rq = cpu_rq(smp_processor_id());
+	spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+	__releases(rq->lock)
+{
+	struct rq *rq;
+
+	rq = cpu_rq(smp_processor_id());
+	spin_unlock(&rq->lock);
+	local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
@@ -1882,12 +1902,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2239,6 +2261,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2381,6 +2424,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 
@@ -2601,6 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (current->sched_class->post_schedule)
@@ -4129,6 +4174,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+	s64 delta_exec;
+	struct rq *rq;
+
+	rq = task_rq(p);
+	WARN_ON_ONCE(!runqueue_is_locked());
+	WARN_ON_ONCE(!task_current(rq, p));
+
+	if (update)
+		update_rq_clock(rq);
+
+	delta_exec = rq->clock - p->se.exec_start;
+
+	WARN_ON_ONCE(delta_exec < 0);
+
+	return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
@@ -4388,6 +4456,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -4583,6 +4652,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
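As a hedged illustration of the task_oncpu_function_call() helper added above: the callback only runs if the task is currently on a CPU, and it runs there in smp_call_function_single() context, so it must stay short and non-sleeping. The names my_remote_func and poke_task are invented for this sketch, not part of the patch.

/* illustrative caller only */
static void my_remote_func(void *info)
{
	struct task_struct *p = info;

	/* executes on the CPU where p is running; IPI context, keep it brief */
	(void)p;
}

static void poke_task(struct task_struct *p)
{
	/* silently does nothing if p is not currently running anywhere */
	task_oncpu_function_call(p, my_remote_func, p);
}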
diff --git a/kernel/sys.c b/kernel/sys.c
index 763c3c17ded3..c2a951ae4223 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1797,6 +1798,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	case PR_SET_TSC:
 		error = SET_TSC_CTL(arg2);
 		break;
+	case PR_TASK_PERF_COUNTERS_DISABLE:
+		error = perf_counter_task_disable();
+		break;
+	case PR_TASK_PERF_COUNTERS_ENABLE:
+		error = perf_counter_task_enable();
+		break;
 	case PR_GET_TIMERSLACK:
 		error = current->timer_slack_ns;
 		break;
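A small user-space sketch of the two prctl options wired up above, bracketing an uninteresting phase so it is not counted. It assumes PR_TASK_PERF_COUNTERS_DISABLE/ENABLE are visible through <linux/prctl.h> and elides errno handling.

#include <sys/prctl.h>
#include <linux/prctl.h>

/* stop this task's counters around fn(), then resume them */
static void run_unmeasured(void (*fn)(void))
{
	prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	fn();
	prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
}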
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a23281707..4be8bbc7577c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);