-rw-r--r--  Documentation/perf-counters.txt | 147
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 3
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 218
-rw-r--r--  arch/x86/include/asm/hardirq_32.h | 1
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 2
-rw-r--r--  arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r--  arch/x86/include/asm/mach-default/entry_arch.h | 5
-rw-r--r--  arch/x86/include/asm/pda.h | 1
-rw-r--r--  arch/x86/include/asm/perf_counter.h | 95
-rw-r--r--  arch/x86/include/asm/thread_info.h | 4
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 1
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 3
-rw-r--r--  arch/x86/kernel/apic.c | 2
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 12
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_counter.c | 695
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r--  arch/x86/kernel/entry_64.S | 5
-rw-r--r--  arch/x86/kernel/irq.c | 5
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 3
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 5
-rw-r--r--  arch/x86/kernel/signal.c | 7
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 2
-rw-r--r--  drivers/acpi/processor_idle.c | 8
-rw-r--r--  drivers/char/sysrq.c | 2
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  include/linux/init_task.h | 11
-rw-r--r--  include/linux/kernel_stat.h | 8
-rw-r--r--  include/linux/perf_counter.h | 257
-rw-r--r--  include/linux/prctl.h | 3
-rw-r--r--  include/linux/sched.h | 12
-rw-r--r--  include/linux/syscalls.h | 8
-rw-r--r--  init/Kconfig | 30
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/exit.c | 13
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/perf_counter.c | 1686
-rw-r--r--  kernel/sched.c | 76
-rw-r--r--  kernel/sys.c | 7
-rw-r--r--  kernel/sys_ni.c | 3
43 files changed, 3342 insertions(+), 50 deletions(-)
diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
new file mode 100644
index 000000000000..fddd32189a50
--- /dev/null
+++ b/Documentation/perf-counters.txt
@@ -0,0 +1,147 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
6CPUs. These registers count the number of certain types of hw events, such
7as instructions executed, cache misses suffered, or branches mis-predicted -
8without slowing down the kernel or applications. These registers can also
9trigger interrupts when a threshold number of events has passed - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per-task and per-CPU counters, counter
14groups, and event capabilities on top of those.
15
16Performance counters are accessed via special file descriptors.
17There's one file descriptor per virtual counter used.
18
19The special file descriptor is opened via the perf_counter_open()
20system call:
21
22 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
23 pid_t pid, int cpu, int group_fd);
24
25The syscall returns the new fd. The fd can be used via the normal
26VFS system calls: read() can be used to read the counter, fcntl()
27can be used to set the blocking mode, etc.
28
29Multiple counters can be kept open at a time, and the counters
30can be poll()ed.
31
32When creating a new counter fd, 'perf_counter_hw_event' is:
33
34/*
35 * Hardware event to monitor via a performance monitoring counter:
36 */
37struct perf_counter_hw_event {
38 s64 type;
39
40 u64 irq_period;
41 u32 record_type;
42
43 u32 disabled : 1, /* off by default */
44 nmi : 1, /* NMI sampling */
45 raw : 1, /* raw event type */
46 __reserved_1 : 29;
47
48 u64 __reserved_2;
49};
50
51/*
52 * Generalized performance counter event types, used by the hw_event.type
53 * parameter of the sys_perf_counter_open() syscall:
54 */
55enum hw_event_types {
56 /*
57 * Common hardware events, generalized by the kernel:
58 */
59 PERF_COUNT_CYCLES = 0,
60 PERF_COUNT_INSTRUCTIONS = 1,
61 PERF_COUNT_CACHE_REFERENCES = 2,
62 PERF_COUNT_CACHE_MISSES = 3,
63 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
64 PERF_COUNT_BRANCH_MISSES = 5,
65
66 /*
67 * Special "software" counters provided by the kernel, even if
68 * the hardware does not support performance counters. These
69 * counters measure various physical and sw events of the
70 * kernel (and allow the profiling of them as well):
71 */
72 PERF_COUNT_CPU_CLOCK = -1,
73 PERF_COUNT_TASK_CLOCK = -2,
74 /*
75 * Future software events:
76 */
77 /* PERF_COUNT_PAGE_FAULTS = -3,
78 PERF_COUNT_CONTEXT_SWITCHES = -4, */
79};
80
81These are standardized types of events that work uniformly on all CPUs
82that implement Performance Counters support under Linux. If a CPU is
83not able to count branch-misses, then the system call will return
84-EINVAL.
85
86More hw_event_types are supported as well, but they are CPU
87specific and are enumerated via /sys on a per CPU basis. Raw hw event
88types can be passed in under hw_event.type if hw_event.raw is 1.
89For example, to count "External bus cycles while bus lock signal asserted"
90events on Intel Core CPUs, pass in a 0x4064 event type value and set
91hw_event.raw to 1.
92
93'record_type' is the type of data that a read() will provide for the
94counter, and it can be one of:
95
96/*
97 * IRQ-notification data record type:
98 */
99enum perf_counter_record_type {
100 PERF_RECORD_SIMPLE = 0,
101 PERF_RECORD_IRQ = 1,
102 PERF_RECORD_GROUP = 2,
103};
104
105a "simple" counter is one that counts hardware events and allows
106them to be read out into a u64 count value. (read() returns 8 on
107a successful read of a simple counter.)
108
109An "irq" counter is one that will also provide an IRQ context information:
110the IP of the interrupted context. In this case read() will return
111the 8-byte counter value, plus the Instruction Pointer address of the
112interrupted context.
113
114The 'irq_period' parameter is the number of events before waking up
115a read() that is blocked on a counter fd. A value of zero means a non-blocking
116counter.
117
118The 'pid' parameter allows the counter to be specific to a task:
119
120 pid == 0: if the pid parameter is zero, the counter is attached to the
121 current task.
122
123 pid > 0: the counter is attached to a specific task (if the current task
124 has sufficient privilege to do so)
125
126 pid < 0: all tasks are counted (per cpu counters)
127
128The 'cpu' parameter allows a counter to be made specific to a full
129CPU:
130
131 cpu >= 0: the counter is restricted to a specific CPU
132 cpu == -1: the counter counts on all CPUs
133
134(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
135
136A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
137events of that task and 'follows' that task to whatever CPU the task
138gets scheduled to. Per task counters can be created by any user, for
139their own tasks.
140
141A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
142all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
143
144Group counters are created by passing in the group_fd of another counter.
145Groups are scheduled in and out as a unit and can be used with PERF_RECORD_GROUP
146to record multi-dimensional timestamps.
147
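
A minimal user-space sketch of the API described above (illustration only, not
part of the patch): it opens a counter that counts instructions of the current
task (pid == 0) across all CPUs (cpu == -1), runs some work, then read()s the
64-bit count. It assumes the syscall numbers wired up elsewhere in this patch
(333 on 32-bit x86, 295 on x86-64) and that a group_fd of -1 requests a
standalone counter; the hw_event structure is redeclared locally.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* mirrors 'struct perf_counter_hw_event' as documented above */
struct perf_counter_hw_event {
	int64_t		type;
	uint64_t	irq_period;
	uint32_t	record_type;
	uint32_t	disabled	:  1,
			nmi		:  1,
			raw		:  1,
			__reserved_1	: 29;
	uint64_t	__reserved_2;
};

#define PERF_COUNT_INSTRUCTIONS	1

#ifdef __x86_64__
# define __NR_perf_counter_open	295	/* from unistd_64.h in this patch */
#else
# define __NR_perf_counter_open	333	/* from unistd_32.h in this patch */
#endif

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = PERF_COUNT_INSTRUCTIONS;	/* generalized hw event */

	/* pid == 0: current task, cpu == -1: any CPU, group_fd == -1: no group */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... workload to be measured goes here ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))	/* 8 bytes */
		printf("instructions: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}

A PERF_RECORD_IRQ counter would additionally return the instruction pointer of
the interrupted context on each read, as described above.
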
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 98a0ed52b5c3..f3921028038c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -648,6 +648,7 @@ config X86_UP_IOAPIC
648config X86_LOCAL_APIC 648config X86_LOCAL_APIC
649 def_bool y 649 def_bool y
650 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) 650 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
651 select HAVE_PERF_COUNTERS if (!M386 && !M486)
651 652
652config X86_IO_APIC 653config X86_IO_APIC
653 def_bool y 654 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..3c14ed07dc4e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
823 .quad compat_sys_signalfd4 823 .quad compat_sys_signalfd4
824 .quad sys_eventfd2 824 .quad sys_eventfd2
825 .quad sys_epoll_create1 825 .quad sys_epoll_create1
826 .quad sys_dup3 /* 330 */ 826 .quad sys_dup3 /* 330 */
827 .quad sys_pipe2 827 .quad sys_pipe2
828 .quad sys_inotify_init1 828 .quad sys_inotify_init1
829 .quad sys_perf_counter_open
829ia32_syscall_end: 830ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index ad5b9f6ecddf..9927e01b03c2 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
255#define smp_mb__before_atomic_inc() barrier() 255#define smp_mb__before_atomic_inc() barrier()
256#define smp_mb__after_atomic_inc() barrier() 256#define smp_mb__after_atomic_inc() barrier()
257 257
258/* A 64-bit atomic type */
259
260typedef struct {
261 unsigned long long counter;
262} atomic64_t;
263
264#define ATOMIC64_INIT(val) { (val) }
265
266/**
267 * __atomic64_read - read atomic64 variable
268 * @ptr: pointer to type atomic64_t
269 *
270 * Atomically reads the value of @ptr.
271 * Doesn't imply a read memory barrier.
272 */
273#define __atomic64_read(ptr) ((ptr)->counter)
274
275static inline unsigned long long
276cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
277{
278 asm volatile(
279
280 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
281
282 : "=A" (old)
283
284 : [ptr] "D" (ptr),
285 "A" (old),
286 "b" (ll_low(new)),
287 "c" (ll_high(new))
288
289 : "memory");
290
291 return old;
292}
293
294static inline unsigned long long
295atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
296 unsigned long long new_val)
297{
298 return cmpxchg8b(&ptr->counter, old_val, new_val);
299}
300
301/**
302 * atomic64_set - set atomic64 variable
303 * @ptr: pointer to type atomic64_t
304 * @new_val: value to assign
305 *
306 * Atomically sets the value of @ptr to @new_val.
307 */
308static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
309{
310 unsigned long long old_val;
311
312 do {
313 old_val = __atomic64_read(ptr);
314 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
315}
316
317/**
318 * atomic64_read - read atomic64 variable
319 * @ptr: pointer to type atomic64_t
320 *
321 * Atomically reads the value of @ptr and returns it.
322 */
323static inline unsigned long long atomic64_read(atomic64_t *ptr)
324{
325 unsigned long long curr_val;
326
327 do {
328 curr_val = __atomic64_read(ptr);
329 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
330
331 return curr_val;
332}
333
334/**
335 * atomic64_add_return - add and return
336 * @delta: integer value to add
337 * @ptr: pointer to type atomic64_t
338 *
339 * Atomically adds @delta to @ptr and returns @delta + *@ptr
340 */
341static inline unsigned long long
342atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
343{
344 unsigned long long old_val, new_val;
345
346 do {
347 old_val = __atomic64_read(ptr);
348 new_val = old_val + delta;
349
350 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
351
352 return new_val;
353}
354
355static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
356{
357 return atomic64_add_return(-delta, ptr);
358}
359
360static inline long atomic64_inc_return(atomic64_t *ptr)
361{
362 return atomic64_add_return(1, ptr);
363}
364
365static inline long atomic64_dec_return(atomic64_t *ptr)
366{
367 return atomic64_sub_return(1, ptr);
368}
369
370/**
371 * atomic64_add - add integer to atomic64 variable
372 * @delta: integer value to add
373 * @ptr: pointer to type atomic64_t
374 *
375 * Atomically adds @delta to @ptr.
376 */
377static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
378{
379 atomic64_add_return(delta, ptr);
380}
381
382/**
383 * atomic64_sub - subtract integer from atomic64 variable
384 * @delta: integer value to subtract
385 * @ptr: pointer to type atomic64_t
386 *
387 * Atomically subtracts @delta from @ptr.
388 */
389static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
390{
391 atomic64_add(-delta, ptr);
392}
393
394/**
395 * atomic64_sub_and_test - subtract value from variable and test result
396 * @delta: integer value to subtract
397 * @ptr: pointer to type atomic64_t
398 *
399 * Atomically subtracts @delta from @ptr and returns
400 * true if the result is zero, or false for all
401 * other cases.
402 */
403static inline int
404atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
405{
406 unsigned long long old_val = atomic64_sub_return(delta, ptr);
407
408 return old_val == 0;
409}
410
411/**
412 * atomic64_inc - increment atomic64 variable
413 * @ptr: pointer to type atomic64_t
414 *
415 * Atomically increments @ptr by 1.
416 */
417static inline void atomic64_inc(atomic64_t *ptr)
418{
419 atomic64_add(1, ptr);
420}
421
422/**
423 * atomic64_dec - decrement atomic64 variable
424 * @ptr: pointer to type atomic64_t
425 *
426 * Atomically decrements @ptr by 1.
427 */
428static inline void atomic64_dec(atomic64_t *ptr)
429{
430 atomic64_sub(1, ptr);
431}
432
433/**
434 * atomic64_dec_and_test - decrement and test
435 * @ptr: pointer to type atomic64_t
436 *
437 * Atomically decrements @ptr by 1 and
438 * returns true if the result is 0, or false for all other
439 * cases.
440 */
441static inline int atomic64_dec_and_test(atomic64_t *ptr)
442{
443 return atomic64_sub_and_test(1, ptr);
444}
445
446/**
447 * atomic64_inc_and_test - increment and test
448 * @ptr: pointer to type atomic64_t
449 *
450 * Atomically increments @ptr by 1
451 * and returns true if the result is zero, or false for all
452 * other cases.
453 */
454static inline int atomic64_inc_and_test(atomic64_t *ptr)
455{
456 return atomic64_sub_and_test(-1, ptr);
457}
458
459/**
460 * atomic64_add_negative - add and test if negative
461 * @delta: integer value to add
462 * @ptr: pointer to type atomic64_t
463 *
464 * Atomically adds @delta to @ptr and returns true
465 * if the result is negative, or false when
466 * result is greater than or equal to zero.
467 */
468static inline int
469atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
470{
471 long long old_val = atomic64_add_return(delta, ptr);
472
473 return old_val < 0;
474}
475
258#include <asm-generic/atomic.h> 476#include <asm-generic/atomic.h>
259#endif /* _ASM_X86_ATOMIC_32_H */ 477#endif /* _ASM_X86_ATOMIC_32_H */
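
The atomic64 helpers added above all reduce to the same compare-and-swap retry
loop around cmpxchg8b. As a rough user-space illustration of that idiom (not
the kernel code itself), the same shape can be written with GCC's
__sync_val_compare_and_swap builtin; on 32-bit x86 this assumes a compiler
target that provides cmpxchg8b.

#include <stdint.h>

/* retry until the compare-and-swap observes the value we read */
static uint64_t example_add_return(uint64_t *ptr, uint64_t delta)
{
	uint64_t old_val, new_val;

	do {
		old_val = *ptr;			/* racy read of current value */
		new_val = old_val + delta;	/* compute the desired value  */
		/* publish only if *ptr still equals old_val, else retry */
	} while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val);

	return new_val;
}

atomic64_set(), atomic64_read() and atomic64_add_return() above follow exactly
this pattern, differing only in how new_val is computed.
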
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index cf7954d1405f..7a07897a7888 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
9 unsigned long idle_timestamp; 9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */ 10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */ 11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int apic_perf_irqs; /* arch dependent */
12 unsigned int irq0_irqs; 13 unsigned int irq0_irqs;
13 unsigned int irq_resched_count; 14 unsigned int irq_resched_count;
14 unsigned int irq_call_count; 15 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b959..aa93e53b85ee 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
30/* Interrupt handlers registered during init_IRQ */ 30/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void); 31extern void apic_timer_interrupt(void);
32extern void error_interrupt(void); 32extern void error_interrupt(void);
33extern void perf_counter_interrupt(void);
34
33extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
34extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
35extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..b8d277f1252f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
87#define LOCAL_TIMER_VECTOR 0xef 87#define LOCAL_TIMER_VECTOR 0xef
88 88
89/* 89/*
90 * Performance monitoring interrupt vector:
91 */
92#define LOCAL_PERF_VECTOR 0xee
93
94/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we 95 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority 96 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector) 97 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..ad31e5d90e90 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
25 * a much simpler SMP time architecture: 25 * a much simpler SMP time architecture:
26 */ 26 */
27#ifdef CONFIG_X86_LOCAL_APIC 27#ifdef CONFIG_X86_LOCAL_APIC
28
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) 29BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 30BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 31BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31 32
33#ifdef CONFIG_PERF_COUNTERS
34BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
35#endif
36
32#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 38BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif 39#endif
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff88df37..90a8d9d4206b 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -30,6 +30,7 @@ struct x8664_pda {
30 short isidle; 30 short isidle;
31 struct mm_struct *active_mm; 31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs; 32 unsigned apic_timer_irqs;
33 unsigned apic_perf_irqs;
33 unsigned irq0_irqs; 34 unsigned irq0_irqs;
34 unsigned irq_resched_count; 35 unsigned irq_resched_count;
35 unsigned irq_call_count; 36 unsigned irq_call_count;
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..2e08ed736647
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,95 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(int nmi);
90#else
91static inline void init_hw_perf_counters(void) { }
92static inline void perf_counters_lapic_init(int nmi) { }
93#endif
94
95#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 98789647baa9..efdf93820aed 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
83#define TIF_SECCOMP 8 /* secure computing */ 83#define TIF_SECCOMP 8 /* secure computing */
84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
85#define TIF_PERF_COUNTERS 11 /* notify perf counter work */
85#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 86#define TIF_NOTSC 16 /* TSC is not accessible in userland */
86#define TIF_IA32 17 /* 32bit process */ 87#define TIF_IA32 17 /* 32bit process */
87#define TIF_FORK 18 /* ret_from_fork */ 88#define TIF_FORK 18 /* ret_from_fork */
@@ -104,6 +105,7 @@ struct thread_info {
104#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 105#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
105#define _TIF_SECCOMP (1 << TIF_SECCOMP) 106#define _TIF_SECCOMP (1 << TIF_SECCOMP)
106#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 107#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
108#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS)
107#define _TIF_NOTSC (1 << TIF_NOTSC) 109#define _TIF_NOTSC (1 << TIF_NOTSC)
108#define _TIF_IA32 (1 << TIF_IA32) 110#define _TIF_IA32 (1 << TIF_IA32)
109#define _TIF_FORK (1 << TIF_FORK) 111#define _TIF_FORK (1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
135 137
136/* Only used for 64 bit */ 138/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \ 139#define _TIF_DO_NOTIFY_MASK \
138 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 140 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
139 141
140/* flags to check in __switch_to() */ 142/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \ 143#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..7e47658b0a6f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
338#define __NR_dup3 330 338#define __NR_dup3 330
339#define __NR_pipe2 331 339#define __NR_pipe2 331
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_perf_counter_open 333
341 342
342#ifdef __KERNEL__ 343#ifdef __KERNEL__
343 344
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666f..53025feaf88d 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
653__SYSCALL(__NR_pipe2, sys_pipe2) 653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294 654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1) 655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656 656#define __NR_perf_counter_open 295
657__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
657 658
658#ifndef __NO_STUBS 659#ifndef __NO_STUBS
659#define __ARCH_WANT_OLD_READDIR 660#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index b5229affb953..6c83ac10e6d3 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -32,6 +32,7 @@
32#include <linux/dmar.h> 32#include <linux/dmar.h>
33#include <linux/ftrace.h> 33#include <linux/ftrace.h>
34 34
35#include <asm/perf_counter.h>
35#include <asm/atomic.h> 36#include <asm/atomic.h>
36#include <asm/smp.h> 37#include <asm/smp.h>
37#include <asm/mtrr.h> 38#include <asm/mtrr.h>
@@ -1136,6 +1137,7 @@ void __cpuinit setup_local_APIC(void)
1136 apic_write(APIC_ESR, 0); 1137 apic_write(APIC_ESR, 0);
1137 } 1138 }
1138#endif 1139#endif
1140 perf_counters_lapic_init(0);
1139 1141
1140 preempt_disable(); 1142 preempt_disable();
1141 1143
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..c3813306e0b4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -22,11 +22,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 24
25obj-$(CONFIG_X86_MCE) += mcheck/ 25obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
26obj-$(CONFIG_MTRR) += mtrr/
27obj-$(CONFIG_CPU_FREQ) += cpufreq/
28 26
29obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 27obj-$(CONFIG_X86_MCE) += mcheck/
28obj-$(CONFIG_MTRR) += mtrr/
29obj-$(CONFIG_CPU_FREQ) += cpufreq/
30
31obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
30 32
31quiet_cmd_mkcapflags = MKCAP $@ 33quiet_cmd_mkcapflags = MKCAP $@
32 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 34 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 42e0853030cb..376b9f9d8d23 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/mtrr.h> 18#include <asm/mtrr.h>
19#include <asm/mce.h> 19#include <asm/mce.h>
20#include <asm/perf_counter.h>
20#include <asm/pat.h> 21#include <asm/pat.h>
21#include <asm/asm.h> 22#include <asm/asm.h>
22#include <asm/numa.h> 23#include <asm/numa.h>
@@ -752,6 +753,7 @@ void __init identify_boot_cpu(void)
752#else 753#else
753 vgetcpu_set_mode(); 754 vgetcpu_set_mode();
754#endif 755#endif
756 init_hw_perf_counters();
755} 757}
756 758
757void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 759void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..9376771f757b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,695 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/perf_counter.h>
11#include <linux/capability.h>
12#include <linux/notifier.h>
13#include <linux/hardirq.h>
14#include <linux/kprobes.h>
15#include <linux/module.h>
16#include <linux/kdebug.h>
17#include <linux/sched.h>
18
19#include <asm/perf_counter.h>
20#include <asm/apic.h>
21
22static bool perf_counters_initialized __read_mostly;
23
24/*
25 * Number of (generic) HW counters:
26 */
27static int nr_counters_generic __read_mostly;
28static u64 perf_counter_mask __read_mostly;
29static u64 counter_value_mask __read_mostly;
30
31static int nr_counters_fixed __read_mostly;
32
33struct cpu_hw_counters {
34 struct perf_counter *counters[X86_PMC_IDX_MAX];
35 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
36};
37
38/*
39 * Intel PerfMon v3. Used on Core2 and later.
40 */
41static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
42
43static const int intel_perfmon_event_map[] =
44{
45 [PERF_COUNT_CPU_CYCLES] = 0x003c,
46 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
47 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
48 [PERF_COUNT_CACHE_MISSES] = 0x412e,
49 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
50 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
51 [PERF_COUNT_BUS_CYCLES] = 0x013c,
52};
53
54static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
55
56/*
57 * Propagate counter elapsed time into the generic counter.
58 * Can only be executed on the CPU where the counter is active.
59 * Returns the delta events processed.
60 */
61static void
62x86_perf_counter_update(struct perf_counter *counter,
63 struct hw_perf_counter *hwc, int idx)
64{
65 u64 prev_raw_count, new_raw_count, delta;
66
67 /*
68 * Careful: an NMI might modify the previous counter value.
69 *
70 * Our tactic to handle this is to first atomically read and
71 * exchange a new raw count - then add that new-prev delta
72 * count to the generic counter atomically:
73 */
74again:
75 prev_raw_count = atomic64_read(&hwc->prev_count);
76 rdmsrl(hwc->counter_base + idx, new_raw_count);
77
78 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
79 new_raw_count) != prev_raw_count)
80 goto again;
81
82 /*
83 * Now we have the new raw value and have updated the prev
84 * timestamp already. We can now calculate the elapsed delta
85 * (counter-)time and add that to the generic counter.
86 *
87 * Careful, not all hw sign-extends above the physical width
88 * of the count, so we do that by clipping the delta to 32 bits:
89 */
90 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
91
92 atomic64_add(delta, &counter->count);
93 atomic64_sub(delta, &hwc->period_left);
94}
95
96/*
97 * Setup the hardware configuration for a given hw_event_type
98 */
99static int __hw_perf_counter_init(struct perf_counter *counter)
100{
101 struct perf_counter_hw_event *hw_event = &counter->hw_event;
102 struct hw_perf_counter *hwc = &counter->hw;
103
104 if (unlikely(!perf_counters_initialized))
105 return -EINVAL;
106
107 /*
108 * Count user events, and generate PMC IRQs:
109 * (keep 'enabled' bit clear for now)
110 */
111 hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
112
113 /*
114 * If privileged enough, count OS events too, and allow
115 * NMI events as well:
116 */
117 hwc->nmi = 0;
118 if (capable(CAP_SYS_ADMIN)) {
119 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
120 if (hw_event->nmi)
121 hwc->nmi = 1;
122 }
123
124 hwc->irq_period = hw_event->irq_period;
125 /*
126 * Intel PMCs cannot be accessed sanely above 32 bit width,
127 * so we install an artificial 1<<31 period regardless of
128 * the generic counter period:
129 */
130 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
131 hwc->irq_period = 0x7FFFFFFF;
132
133 atomic64_set(&hwc->period_left, hwc->irq_period);
134
135 /*
136 * Raw event types provide the config directly in the event structure
137 */
138 if (hw_event->raw) {
139 hwc->config |= hw_event->type;
140 } else {
141 if (hw_event->type >= max_intel_perfmon_events)
142 return -EINVAL;
143 /*
144 * The generic map:
145 */
146 hwc->config |= intel_perfmon_event_map[hw_event->type];
147 }
148 counter->wakeup_pending = 0;
149
150 return 0;
151}
152
153u64 hw_perf_save_disable(void)
154{
155 u64 ctrl;
156
157 if (unlikely(!perf_counters_initialized))
158 return 0;
159
160 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
161 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
162
163 return ctrl;
164}
165EXPORT_SYMBOL_GPL(hw_perf_save_disable);
166
167void hw_perf_restore(u64 ctrl)
168{
169 if (unlikely(!perf_counters_initialized))
170 return;
171
172 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
173}
174EXPORT_SYMBOL_GPL(hw_perf_restore);
175
176static inline void
177__pmc_fixed_disable(struct perf_counter *counter,
178 struct hw_perf_counter *hwc, unsigned int __idx)
179{
180 int idx = __idx - X86_PMC_IDX_FIXED;
181 u64 ctrl_val, mask;
182 int err;
183
184 mask = 0xfULL << (idx * 4);
185
186 rdmsrl(hwc->config_base, ctrl_val);
187 ctrl_val &= ~mask;
188 err = checking_wrmsrl(hwc->config_base, ctrl_val);
189}
190
191static inline void
192__pmc_generic_disable(struct perf_counter *counter,
193 struct hw_perf_counter *hwc, unsigned int idx)
194{
195 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
196 __pmc_fixed_disable(counter, hwc, idx);
197 else
198 wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
199}
200
201static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
202
203/*
204 * Set the next IRQ period, based on the hwc->period_left value.
205 * To be called with the counter disabled in hw:
206 */
207static void
208__hw_perf_counter_set_period(struct perf_counter *counter,
209 struct hw_perf_counter *hwc, int idx)
210{
211 s64 left = atomic64_read(&hwc->period_left);
212 s32 period = hwc->irq_period;
213 int err;
214
215 /*
216 * If we are way outside a reasonable range then just skip forward:
217 */
218 if (unlikely(left <= -period)) {
219 left = period;
220 atomic64_set(&hwc->period_left, left);
221 }
222
223 if (unlikely(left <= 0)) {
224 left += period;
225 atomic64_set(&hwc->period_left, left);
226 }
227
228 per_cpu(prev_left[idx], smp_processor_id()) = left;
229
230 /*
231 * The hw counter starts counting from this counter offset,
232 * mark it to be able to extract future deltas:
233 */
234 atomic64_set(&hwc->prev_count, (u64)-left);
235
236 err = checking_wrmsrl(hwc->counter_base + idx,
237 (u64)(-left) & counter_value_mask);
238}
239
240static inline void
241__pmc_fixed_enable(struct perf_counter *counter,
242 struct hw_perf_counter *hwc, unsigned int __idx)
243{
244 int idx = __idx - X86_PMC_IDX_FIXED;
245 u64 ctrl_val, bits, mask;
246 int err;
247
248 /*
249 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
250 * and enable ring-0 counting if allowed:
251 */
252 bits = 0x8ULL | 0x2ULL;
253 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
254 bits |= 0x1;
255 bits <<= (idx * 4);
256 mask = 0xfULL << (idx * 4);
257
258 rdmsrl(hwc->config_base, ctrl_val);
259 ctrl_val &= ~mask;
260 ctrl_val |= bits;
261 err = checking_wrmsrl(hwc->config_base, ctrl_val);
262}
263
264static void
265__pmc_generic_enable(struct perf_counter *counter,
266 struct hw_perf_counter *hwc, int idx)
267{
268 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
269 __pmc_fixed_enable(counter, hwc, idx);
270 else
271 wrmsr(hwc->config_base + idx,
272 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
273}
274
275static int
276fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
277{
278 unsigned int event;
279
280 if (unlikely(hwc->nmi))
281 return -1;
282
283 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
284
285 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
286 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
287 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
288 return X86_PMC_IDX_FIXED_CPU_CYCLES;
289 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
290 return X86_PMC_IDX_FIXED_BUS_CYCLES;
291
292 return -1;
293}
294
295/*
296 * Find a PMC slot for the freshly enabled / scheduled in counter:
297 */
298static int pmc_generic_enable(struct perf_counter *counter)
299{
300 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
301 struct hw_perf_counter *hwc = &counter->hw;
302 int idx;
303
304 idx = fixed_mode_idx(counter, hwc);
305 if (idx >= 0) {
306 /*
307 * Try to get the fixed counter, if that is already taken
308 * then try to get a generic counter:
309 */
310 if (test_and_set_bit(idx, cpuc->used))
311 goto try_generic;
312
313 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
314 /*
315 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
316 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
317 */
318 hwc->counter_base =
319 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
320 hwc->idx = idx;
321 } else {
322 idx = hwc->idx;
323 /* Try to get the previous generic counter again */
324 if (test_and_set_bit(idx, cpuc->used)) {
325try_generic:
326 idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
327 if (idx == nr_counters_generic)
328 return -EAGAIN;
329
330 set_bit(idx, cpuc->used);
331 hwc->idx = idx;
332 }
333 hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
334 hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
335 }
336
337 perf_counters_lapic_init(hwc->nmi);
338
339 __pmc_generic_disable(counter, hwc, idx);
340
341 cpuc->counters[idx] = counter;
342 /*
343 * Make it visible before enabling the hw:
344 */
345 smp_wmb();
346
347 __hw_perf_counter_set_period(counter, hwc, idx);
348 __pmc_generic_enable(counter, hwc, idx);
349
350 return 0;
351}
352
353void perf_counter_print_debug(void)
354{
355 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
356 struct cpu_hw_counters *cpuc;
357 int cpu, idx;
358
359 if (!nr_counters_generic)
360 return;
361
362 local_irq_disable();
363
364 cpu = smp_processor_id();
365 cpuc = &per_cpu(cpu_hw_counters, cpu);
366
367 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
368 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
369 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
370 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
371
372 printk(KERN_INFO "\n");
373 printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
374 printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
375 printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
376 printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed);
377 printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
378
379 for (idx = 0; idx < nr_counters_generic; idx++) {
380 rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
381 rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
382
383 prev_left = per_cpu(prev_left[idx], cpu);
384
385 printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n",
386 cpu, idx, pmc_ctrl);
387 printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n",
388 cpu, idx, pmc_count);
389 printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n",
390 cpu, idx, prev_left);
391 }
392 for (idx = 0; idx < nr_counters_fixed; idx++) {
393 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
394
395 printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
396 cpu, idx, pmc_count);
397 }
398 local_irq_enable();
399}
400
401static void pmc_generic_disable(struct perf_counter *counter)
402{
403 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
404 struct hw_perf_counter *hwc = &counter->hw;
405 unsigned int idx = hwc->idx;
406
407 __pmc_generic_disable(counter, hwc, idx);
408
409 clear_bit(idx, cpuc->used);
410 cpuc->counters[idx] = NULL;
411 /*
412 * Make sure the cleared pointer becomes visible before we
413 * (potentially) free the counter:
414 */
415 smp_wmb();
416
417 /*
418 * Drain the remaining delta count out of a counter
419 * that we are disabling:
420 */
421 x86_perf_counter_update(counter, hwc, idx);
422}
423
424static void perf_store_irq_data(struct perf_counter *counter, u64 data)
425{
426 struct perf_data *irqdata = counter->irqdata;
427
428 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
429 irqdata->overrun++;
430 } else {
431 u64 *p = (u64 *) &irqdata->data[irqdata->len];
432
433 *p = data;
434 irqdata->len += sizeof(u64);
435 }
436}
437
438/*
439 * Save and restart an expired counter. Called from NMI context,
440 * so it has to be careful about preempting normal counter ops:
441 */
442static void perf_save_and_restart(struct perf_counter *counter)
443{
444 struct hw_perf_counter *hwc = &counter->hw;
445 int idx = hwc->idx;
446
447 x86_perf_counter_update(counter, hwc, idx);
448 __hw_perf_counter_set_period(counter, hwc, idx);
449
450 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
451 __pmc_generic_enable(counter, hwc, idx);
452}
453
454static void
455perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
456{
457 struct perf_counter *counter, *group_leader = sibling->group_leader;
458
459 /*
460 * Store sibling timestamps (if any):
461 */
462 list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
463
464 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
465 perf_store_irq_data(sibling, counter->hw_event.type);
466 perf_store_irq_data(sibling, atomic64_read(&counter->count));
467 }
468}
469
470/*
471 * This handler is triggered by the local APIC, so the APIC IRQ handling
472 * rules apply:
473 */
474static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
475{
476 int bit, cpu = smp_processor_id();
477 u64 ack, status, saved_global;
478 struct cpu_hw_counters *cpuc;
479
480 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
481
482 /* Disable counters globally */
483 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
484 ack_APIC_irq();
485
486 cpuc = &per_cpu(cpu_hw_counters, cpu);
487
488 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
489 if (!status)
490 goto out;
491
492again:
493 ack = status;
494 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
495 struct perf_counter *counter = cpuc->counters[bit];
496
497 clear_bit(bit, (unsigned long *) &status);
498 if (!counter)
499 continue;
500
501 perf_save_and_restart(counter);
502
503 switch (counter->hw_event.record_type) {
504 case PERF_RECORD_SIMPLE:
505 continue;
506 case PERF_RECORD_IRQ:
507 perf_store_irq_data(counter, instruction_pointer(regs));
508 break;
509 case PERF_RECORD_GROUP:
510 perf_handle_group(counter, &status, &ack);
511 break;
512 }
513 /*
514 * From NMI context we cannot call into the scheduler to
515 * do a task wakeup - but we mark these counters as
516 * wakeup_pending and initiate a wakeup callback:
517 */
518 if (nmi) {
519 counter->wakeup_pending = 1;
520 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
521 } else {
522 wake_up(&counter->waitq);
523 }
524 }
525
526 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
527
528 /*
529 * Repeat if there is more work to be done:
530 */
531 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
532 if (status)
533 goto again;
534out:
535 /*
536 * Restore - do not reenable when global enable is off:
537 */
538 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
539}
540
541void smp_perf_counter_interrupt(struct pt_regs *regs)
542{
543 irq_enter();
544 inc_irq_stat(apic_perf_irqs);
545 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
546 __smp_perf_counter_interrupt(regs, 0);
547
548 irq_exit();
549}
550
551/*
552 * This handler performs the wakeups deferred from NMI context:
553 */
554void perf_counter_notify(struct pt_regs *regs)
555{
556 struct cpu_hw_counters *cpuc;
557 unsigned long flags;
558 int bit, cpu;
559
560 local_irq_save(flags);
561 cpu = smp_processor_id();
562 cpuc = &per_cpu(cpu_hw_counters, cpu);
563
564 for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
565 struct perf_counter *counter = cpuc->counters[bit];
566
567 if (!counter)
568 continue;
569
570 if (counter->wakeup_pending) {
571 counter->wakeup_pending = 0;
572 wake_up(&counter->waitq);
573 }
574 }
575
576 local_irq_restore(flags);
577}
578
579void __cpuinit perf_counters_lapic_init(int nmi)
580{
581 u32 apic_val;
582
583 if (!perf_counters_initialized)
584 return;
585 /*
586 * Enable the performance counter vector in the APIC LVT:
587 */
588 apic_val = apic_read(APIC_LVTERR);
589
590 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
591 if (nmi)
592 apic_write(APIC_LVTPC, APIC_DM_NMI);
593 else
594 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
595 apic_write(APIC_LVTERR, apic_val);
596}
597
598static int __kprobes
599perf_counter_nmi_handler(struct notifier_block *self,
600 unsigned long cmd, void *__args)
601{
602 struct die_args *args = __args;
603 struct pt_regs *regs;
604
605 if (likely(cmd != DIE_NMI_IPI))
606 return NOTIFY_DONE;
607
608 regs = args->regs;
609
610 apic_write(APIC_LVTPC, APIC_DM_NMI);
611 __smp_perf_counter_interrupt(regs, 1);
612
613 return NOTIFY_STOP;
614}
615
616static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
617 .notifier_call = perf_counter_nmi_handler
618};
619
620void __init init_hw_perf_counters(void)
621{
622 union cpuid10_eax eax;
623 unsigned int ebx;
624 unsigned int unused;
625 union cpuid10_edx edx;
626
627 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
628 return;
629
630 /*
631 * Check whether the Architectural PerfMon supports
632 * Branch Misses Retired Event or not.
633 */
634 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
635 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
636 return;
637
638 printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
639
640 printk(KERN_INFO "... version: %d\n", eax.split.version_id);
641 printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters);
642 nr_counters_generic = eax.split.num_counters;
643 if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
644 nr_counters_generic = X86_PMC_MAX_GENERIC;
645 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
646 nr_counters_generic, X86_PMC_MAX_GENERIC);
647 }
648 perf_counter_mask = (1 << nr_counters_generic) - 1;
649 perf_max_counters = nr_counters_generic;
650
651 printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width);
652 counter_value_mask = (1ULL << eax.split.bit_width) - 1;
653 printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask);
654
655 printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length);
656
657 nr_counters_fixed = edx.split.num_counters_fixed;
658 if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
659 nr_counters_fixed = X86_PMC_MAX_FIXED;
660 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
661 nr_counters_fixed, X86_PMC_MAX_FIXED);
662 }
663 printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed);
664
665 perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
666
667 printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask);
668 perf_counters_initialized = true;
669
670 perf_counters_lapic_init(0);
671 register_die_notifier(&perf_counter_nmi_notifier);
672}
673
674static void pmc_generic_read(struct perf_counter *counter)
675{
676 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
677}
678
679static const struct hw_perf_counter_ops x86_perf_counter_ops = {
680 .enable = pmc_generic_enable,
681 .disable = pmc_generic_disable,
682 .read = pmc_generic_read,
683};
684
685const struct hw_perf_counter_ops *
686hw_perf_counter_init(struct perf_counter *counter)
687{
688 int err;
689
690 err = __hw_perf_counter_init(counter);
691 if (err)
692 return NULL;
693
694 return &x86_perf_counter_ops;
695}
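
A small worked example (illustration only) of the 32-bit delta clipping done in
x86_perf_counter_update() above: the counter is armed at -left, so after an
overflow the raw MSR value is small while the previous value is near 2^32, and
truncating the difference to 32 bits recovers the real number of events.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t prev_raw_count = 0xffffff00;	/* counter was armed at -256 */
	uint64_t new_raw_count  = 0x00000020;	/* overflowed, now reads +32 */

	/* same expression as in x86_perf_counter_update(): clip to 32 bits */
	uint64_t delta = (uint64_t)(uint32_t)((int32_t)new_raw_count -
					      (int32_t)prev_raw_count);

	assert(delta == 288);			/* 256 + 32 events occurred  */
	printf("delta = %llu\n", (unsigned long long)delta);

	return 0;
}
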
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e28c7a987793..1954a9662203 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1024,6 +1024,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1024apicinterrupt SPURIOUS_APIC_VECTOR \ 1024apicinterrupt SPURIOUS_APIC_VECTOR \
1025 spurious_interrupt smp_spurious_interrupt 1025 spurious_interrupt smp_spurious_interrupt
1026 1026
1027#ifdef CONFIG_PERF_COUNTERS
1028apicinterrupt LOCAL_PERF_VECTOR \
1029 perf_counter_interrupt smp_perf_counter_interrupt
1030#endif
1031
1027/* 1032/*
1028 * Exception entry points. 1033 * Exception entry points.
1029 */ 1034 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..d92bc71e41a7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p)
56 for_each_online_cpu(j) 56 for_each_online_cpu(j)
57 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 57 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
58 seq_printf(p, " Local timer interrupts\n"); 58 seq_printf(p, " Local timer interrupts\n");
59 seq_printf(p, "CNT: ");
60 for_each_online_cpu(j)
61 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
62 seq_printf(p, " Performance counter interrupts\n");
59#endif 63#endif
60#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
61 seq_printf(p, "RES: "); 65 seq_printf(p, "RES: ");
@@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
160 164
161#ifdef CONFIG_X86_LOCAL_APIC 165#ifdef CONFIG_X86_LOCAL_APIC
162 sum += irq_stats(cpu)->apic_timer_irqs; 166 sum += irq_stats(cpu)->apic_timer_irqs;
167 sum += irq_stats(cpu)->apic_perf_irqs;
163#endif 168#endif
164#ifdef CONFIG_SMP 169#ifdef CONFIG_SMP
165 sum += irq_stats(cpu)->irq_resched_count; 170 sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63044a5..6a33b5e30161 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -160,6 +160,9 @@ void __init native_init_IRQ(void)
160 /* IPI vectors for APIC spurious and error interrupts */ 160 /* IPI vectors for APIC spurious and error interrupts */
161 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 161 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
162 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 162 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
163# ifdef CONFIG_PERF_COUNTERS
164 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
165# endif
163#endif 166#endif
164 167
165#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 168#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3ce626e..91d785c25ad9 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -138,6 +138,11 @@ static void __init apic_intr_init(void)
138 /* IPI vectors for APIC spurious and error interrupts */ 138 /* IPI vectors for APIC spurious and error interrupts */
139 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 139 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
140 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 140 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
141
142 /* Performance monitoring interrupt: */
143#ifdef CONFIG_PERF_COUNTERS
144 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
145#endif
141} 146}
142 147
143void __init native_init_IRQ(void) 148void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 89bb7668041d..4fa5243c2069 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9 9#include <linux/perf_counter.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
@@ -886,6 +886,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
886 tracehook_notify_resume(regs); 886 tracehook_notify_resume(regs);
887 } 887 }
888 888
889 if (thread_info_flags & _TIF_PERF_COUNTERS) {
890 clear_thread_flag(TIF_PERF_COUNTERS);
891 perf_counter_notify(regs);
892 }
893
889#ifdef CONFIG_X86_32 894#ifdef CONFIG_X86_32
890 clear_thread_flag(TIF_IRET); 895 clear_thread_flag(TIF_IRET);
891#endif /* CONFIG_X86_32 */ 896#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..496726ddcea1 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c744cf3..07c914555a5e 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 38aca048e951..f2a043131727 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -270,8 +270,11 @@ static atomic_t c3_cpu_count;
270/* Common C-state entry for C2, C3, .. */ 270/* Common C-state entry for C2, C3, .. */
271static void acpi_cstate_enter(struct acpi_processor_cx *cstate) 271static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
272{ 272{
273 u64 perf_flags;
274
273 /* Don't trace irqs off for idle */ 275 /* Don't trace irqs off for idle */
274 stop_critical_timings(); 276 stop_critical_timings();
277 perf_flags = hw_perf_save_disable();
275 if (cstate->entry_method == ACPI_CSTATE_FFH) { 278 if (cstate->entry_method == ACPI_CSTATE_FFH) {
276 /* Call into architectural FFH based C-state */ 279 /* Call into architectural FFH based C-state */
277 acpi_processor_ffh_cstate_enter(cstate); 280 acpi_processor_ffh_cstate_enter(cstate);
@@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
284 gets asserted in time to freeze execution properly. */ 287 gets asserted in time to freeze execution properly. */
285 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 288 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
286 } 289 }
290 hw_perf_restore(perf_flags);
287 start_critical_timings(); 291 start_critical_timings();
288} 292}
289#endif /* !CONFIG_CPU_IDLE */ 293#endif /* !CONFIG_CPU_IDLE */
@@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
1425 */ 1429 */
1426static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 1430static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1427{ 1431{
1432 u64 pctrl;
1433
1428 /* Don't trace irqs off for idle */ 1434 /* Don't trace irqs off for idle */
1429 stop_critical_timings(); 1435 stop_critical_timings();
1436 pctrl = hw_perf_save_disable();
1430 if (cx->entry_method == ACPI_CSTATE_FFH) { 1437 if (cx->entry_method == ACPI_CSTATE_FFH) {
1431 /* Call into architectural FFH based C-state */ 1438 /* Call into architectural FFH based C-state */
1432 acpi_processor_ffh_cstate_enter(cx); 1439 acpi_processor_ffh_cstate_enter(cx);
@@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1441 gets asserted in time to freeze execution properly. */ 1448 gets asserted in time to freeze execution properly. */
1442 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 1449 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
1443 } 1450 }
1451 hw_perf_restore(pctrl);
1444 start_critical_timings(); 1452 start_critical_timings();
1445} 1453}
1446 1454
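
The two hunks above use the same bracket that kernel/perf_counter.c (added later in this patch) puts around counter-list updates: hw_perf_save_disable() hands back an opaque control word that must be passed to hw_perf_restore(). A minimal sketch of that pairing, assuming <linux/perf_counter.h> is included; it is not taken verbatim from the patch:

    {
    	u64 perf_flags;

    	perf_flags = hw_perf_save_disable();
    	/* ... section that must not be disturbed by counter interrupts ... */
    	hw_perf_restore(perf_flags);
    }

On configurations without CONFIG_PERF_COUNTERS the two calls fall back to the empty inline stubs in the header, so the bracket costs nothing there.
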
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 94966edfb44d..9bcf0c9848b1 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
244 struct pt_regs *regs = get_irq_regs(); 245 struct pt_regs *regs = get_irq_regs();
245 if (regs) 246 if (regs)
246 show_regs(regs); 247 show_regs(regs);
248 perf_counter_print_debug();
247} 249}
248static struct sysrq_key_op sysrq_showregs_op = { 250static struct sysrq_key_op sysrq_showregs_op = {
249 .handler = sysrq_handle_showregs, 251 .handler = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index 1f59ea079cbb..911dd0fd7e09 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -1019,6 +1020,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1019 1020
1020 current->personality &= ~bprm->per_clear; 1021 current->personality &= ~bprm->per_clear;
1021 1022
1023 /*
1024 * Flush performance counters when crossing a
1025 * security domain:
1026 */
1027 if (!get_dumpable(current->mm))
1028 perf_counter_exit_task(current);
1029
1022 /* An exec changes our domain. We are no longer part of the thread 1030 /* An exec changes our domain. We are no longer part of the thread
1023 group */ 1031 group */
1024 1032
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 959f5522d10a..d0e6cf3b201c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -114,6 +114,16 @@ extern struct group_info init_groups;
114 114
115extern struct cred init_cred; 115extern struct cred init_cred;
116 116
117#ifdef CONFIG_PERF_COUNTERS
118# define INIT_PERF_COUNTERS(tsk) \
119 .perf_counter_ctx.counter_list = \
120 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
121 .perf_counter_ctx.lock = \
122 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
123#else
124# define INIT_PERF_COUNTERS(tsk)
125#endif
126
117/* 127/*
118 * INIT_TASK is used to set up the first task table, touch at 128 * INIT_TASK is used to set up the first task table, touch at
119 * your own risk!. Base=0, limit=0x1fffff (=2MB) 129 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -178,6 +188,7 @@ extern struct cred init_cred;
178 INIT_IDS \ 188 INIT_IDS \
179 INIT_TRACE_IRQFLAGS \ 189 INIT_TRACE_IRQFLAGS \
180 INIT_LOCKDEP \ 190 INIT_LOCKDEP \
191 INIT_PERF_COUNTERS(tsk) \
181} 192}
182 193
183 194
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145caeee07..1b2e3242497c 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
66 return sum; 66 return sum;
67} 67}
68 68
69
70/*
71 * Lock/unlock the current runqueue - to extract task statistics:
72 */
73extern void curr_rq_lock_irq_save(unsigned long *flags);
74extern void curr_rq_unlock_irq_restore(unsigned long *flags);
75extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
69extern unsigned long long task_delta_exec(struct task_struct *); 76extern unsigned long long task_delta_exec(struct task_struct *);
77
70extern void account_user_time(struct task_struct *, cputime_t); 78extern void account_user_time(struct task_struct *, cputime_t);
71extern void account_user_time_scaled(struct task_struct *, cputime_t); 79extern void account_user_time_scaled(struct task_struct *, cputime_t);
72extern void account_system_time(struct task_struct *, int, cputime_t); 80extern void account_system_time(struct task_struct *, int, cputime_t);
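
These helpers are consumed further down in kernel/perf_counter.c; a condensed sketch of how they pair up there (mirroring perf_counter_task_disable() and task_clock_perf_counter_val() in that file, not a new interface):

    {
    	unsigned long flags;
    	u64 runtime;

    	curr_rq_lock_irq_save(&flags);

    	/*
    	 * With the current runqueue locked, the not-yet-accounted
    	 * exec-time delta is stable and can be folded in:
    	 */
    	runtime = current->se.sum_exec_runtime +
    		  __task_delta_exec(current, 1);

    	curr_rq_unlock_irq_restore(&flags);
    }
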
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..cc3a75a239a9
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,257 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <asm/atomic.h>
17
18#ifdef CONFIG_PERF_COUNTERS
19# include <asm/perf_counter.h>
20#endif
21
22#include <linux/list.h>
23#include <linux/mutex.h>
24#include <linux/rculist.h>
25#include <linux/rcupdate.h>
26#include <linux/spinlock.h>
27
28struct task_struct;
29
30/*
31 * User-space ABI bits:
32 */
33
34/*
35 * Generalized performance counter event types, used by the hw_event.type
36 * parameter of the sys_perf_counter_open() syscall:
37 */
38enum hw_event_types {
39 /*
40 * Common hardware events, generalized by the kernel:
41 */
42 PERF_COUNT_CPU_CYCLES = 0,
43 PERF_COUNT_INSTRUCTIONS = 1,
44 PERF_COUNT_CACHE_REFERENCES = 2,
45 PERF_COUNT_CACHE_MISSES = 3,
46 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
47 PERF_COUNT_BRANCH_MISSES = 5,
48 PERF_COUNT_BUS_CYCLES = 6,
49
50 PERF_HW_EVENTS_MAX = 7,
51
52 /*
53 * Special "software" counters provided by the kernel, even if
54 * the hardware does not support performance counters. These
55 * counters measure various physical and sw events of the
56 * kernel (and allow the profiling of them as well):
57 */
58 PERF_COUNT_CPU_CLOCK = -1,
59 PERF_COUNT_TASK_CLOCK = -2,
60 PERF_COUNT_PAGE_FAULTS = -3,
61 PERF_COUNT_CONTEXT_SWITCHES = -4,
62 PERF_COUNT_CPU_MIGRATIONS = -5,
63
64 PERF_SW_EVENTS_MIN = -6,
65};
66
67/*
68 * IRQ-notification data record type:
69 */
70enum perf_counter_record_type {
71 PERF_RECORD_SIMPLE = 0,
72 PERF_RECORD_IRQ = 1,
73 PERF_RECORD_GROUP = 2,
74};
75
76/*
77 * Hardware event to monitor via a performance monitoring counter:
78 */
79struct perf_counter_hw_event {
80 s64 type;
81
82 u64 irq_period;
83 u32 record_type;
84
85 u32 disabled : 1, /* off by default */
86 nmi : 1, /* NMI sampling */
87 raw : 1, /* raw event type */
88 inherit : 1, /* children inherit it */
89 __reserved_1 : 28;
90
91 u64 __reserved_2;
92};
93
94/*
95 * Kernel-internal data types:
96 */
97
98/**
99 * struct hw_perf_counter - performance counter hardware details:
100 */
101struct hw_perf_counter {
102#ifdef CONFIG_PERF_COUNTERS
103 u64 config;
104 unsigned long config_base;
105 unsigned long counter_base;
106 int nmi;
107 unsigned int idx;
108 atomic64_t prev_count;
109 u64 irq_period;
110 atomic64_t period_left;
111#endif
112};
113
114/*
115 * Hardcoded buffer length limit for now, for IRQ-fed events:
116 */
117#define PERF_DATA_BUFLEN 2048
118
119/**
120 * struct perf_data - performance counter IRQ data sampling ...
121 */
122struct perf_data {
123 int len;
124 int rd_idx;
125 int overrun;
126 u8 data[PERF_DATA_BUFLEN];
127};
128
129struct perf_counter;
130
131/**
132 * struct hw_perf_counter_ops - performance counter hw ops
133 */
134struct hw_perf_counter_ops {
135 int (*enable) (struct perf_counter *counter);
136 void (*disable) (struct perf_counter *counter);
137 void (*read) (struct perf_counter *counter);
138};
139
140/**
141 * enum perf_counter_active_state - the states of a counter
142 */
143enum perf_counter_active_state {
144 PERF_COUNTER_STATE_OFF = -1,
145 PERF_COUNTER_STATE_INACTIVE = 0,
146 PERF_COUNTER_STATE_ACTIVE = 1,
147};
148
149struct file;
150
151/**
152 * struct perf_counter - performance counter kernel representation:
153 */
154struct perf_counter {
155#ifdef CONFIG_PERF_COUNTERS
156 struct list_head list_entry;
157 struct list_head sibling_list;
158 struct perf_counter *group_leader;
159 const struct hw_perf_counter_ops *hw_ops;
160
161 enum perf_counter_active_state state;
162 atomic64_t count;
163
164 struct perf_counter_hw_event hw_event;
165 struct hw_perf_counter hw;
166
167 struct perf_counter_context *ctx;
168 struct task_struct *task;
169 struct file *filp;
170
171 struct perf_counter *parent;
172 /*
173 * Protect attach/detach:
174 */
175 struct mutex mutex;
176
177 int oncpu;
178 int cpu;
179
180 /* read() / irq related data */
181 wait_queue_head_t waitq;
182 /* optional: for NMIs */
183 int wakeup_pending;
184 struct perf_data *irqdata;
185 struct perf_data *usrdata;
186 struct perf_data data[2];
187#endif
188};
189
190/**
191 * struct perf_counter_context - counter context structure
192 *
193 * Used as a container for task counters and CPU counters as well:
194 */
195struct perf_counter_context {
196#ifdef CONFIG_PERF_COUNTERS
197 /*
198 * Protect the list of counters:
199 */
200 spinlock_t lock;
201
202 struct list_head counter_list;
203 int nr_counters;
204 int nr_active;
205 struct task_struct *task;
206#endif
207};
208
209/**
210 * struct perf_counter_cpu_context - per cpu counter context structure
211 */
212struct perf_cpu_context {
213 struct perf_counter_context ctx;
214 struct perf_counter_context *task_ctx;
215 int active_oncpu;
216 int max_pertask;
217};
218
219/*
220 * Set by architecture code:
221 */
222extern int perf_max_counters;
223
224#ifdef CONFIG_PERF_COUNTERS
225extern const struct hw_perf_counter_ops *
226hw_perf_counter_init(struct perf_counter *counter);
227
228extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
229extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
230extern void perf_counter_task_tick(struct task_struct *task, int cpu);
231extern void perf_counter_init_task(struct task_struct *child);
232extern void perf_counter_exit_task(struct task_struct *child);
233extern void perf_counter_notify(struct pt_regs *regs);
234extern void perf_counter_print_debug(void);
235extern u64 hw_perf_save_disable(void);
236extern void hw_perf_restore(u64 ctrl);
237extern int perf_counter_task_disable(void);
238extern int perf_counter_task_enable(void);
239
240#else
241static inline void
242perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
243static inline void
244perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
245static inline void
246perf_counter_task_tick(struct task_struct *task, int cpu) { }
247static inline void perf_counter_init_task(struct task_struct *child) { }
248static inline void perf_counter_exit_task(struct task_struct *child) { }
249static inline void perf_counter_notify(struct pt_regs *regs) { }
250static inline void perf_counter_print_debug(void) { }
251static inline void hw_perf_restore(u64 ctrl) { }
252static inline u64 hw_perf_save_disable(void) { return 0; }
253static inline int perf_counter_task_disable(void) { return -EINVAL; }
254static inline int perf_counter_task_enable(void) { return -EINVAL; }
255#endif
256
257#endif /* _LINUX_PERF_COUNTER_H */
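
For illustration, a hedged sketch of how the ABI bits above combine: positive type values select the generalized hardware events, negative values select the software counters, and record_type plus irq_period choose between plain reads and IRQ-driven sampling. Field names follow the header; the concrete values are only examples:

    	/* one IRQ record per 10000 branch misses; starts disabled, inherited */
    	struct perf_counter_hw_event branch_sampling = {
    		.type		= PERF_COUNT_BRANCH_MISSES,
    		.irq_period	= 10000,
    		.record_type	= PERF_RECORD_IRQ,
    		.disabled	= 1,
    		.inherit	= 1,
    	};

    	/* a software counter: per-task context switches, read as a plain u64 */
    	struct perf_counter_hw_event ctx_switches = {
    		.type		= PERF_COUNT_CONTEXT_SWITCHES,
    		.record_type	= PERF_RECORD_SIMPLE,
    	};
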
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
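
These options are presumably routed to perf_counter_task_disable()/perf_counter_task_enable() via the kernel/sys.c hunk of this patch; a minimal user-space sketch, assuming only <sys/prctl.h> and the two values added above:

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_TASK_PERF_COUNTERS_DISABLE
    # define PR_TASK_PERF_COUNTERS_DISABLE	31
    # define PR_TASK_PERF_COUNTERS_ENABLE	32
    #endif

    int main(void)
    {
    	if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
    		perror("PR_TASK_PERF_COUNTERS_DISABLE");

    	/* ... section that should not be counted ... */

    	if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
    		perror("PR_TASK_PERF_COUNTERS_ENABLE");

    	return 0;
    }
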
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8395e715809d..fc2c6f3477e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/fs_struct.h> 71#include <linux/fs_struct.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -976,6 +977,8 @@ struct sched_entity {
976 u64 last_wakeup; 977 u64 last_wakeup;
977 u64 avg_overlap; 978 u64 avg_overlap;
978 979
980 u64 nr_migrations;
981
979#ifdef CONFIG_SCHEDSTATS 982#ifdef CONFIG_SCHEDSTATS
980 u64 wait_start; 983 u64 wait_start;
981 u64 wait_max; 984 u64 wait_max;
@@ -991,7 +994,6 @@ struct sched_entity {
991 u64 exec_max; 994 u64 exec_max;
992 u64 slice_max; 995 u64 slice_max;
993 996
994 u64 nr_migrations;
995 u64 nr_migrations_cold; 997 u64 nr_migrations_cold;
996 u64 nr_failed_migrations_affine; 998 u64 nr_failed_migrations_affine;
997 u64 nr_failed_migrations_running; 999 u64 nr_failed_migrations_running;
@@ -1294,6 +1296,7 @@ struct task_struct {
1294 struct list_head pi_state_list; 1296 struct list_head pi_state_list;
1295 struct futex_pi_state *pi_state_cache; 1297 struct futex_pi_state *pi_state_cache;
1296#endif 1298#endif
1299 struct perf_counter_context perf_counter_ctx;
1297#ifdef CONFIG_NUMA 1300#ifdef CONFIG_NUMA
1298 struct mempolicy *mempolicy; 1301 struct mempolicy *mempolicy;
1299 short il_next; 1302 short il_next;
@@ -2269,6 +2272,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2269#define TASK_SIZE_OF(tsk) TASK_SIZE 2272#define TASK_SIZE_OF(tsk) TASK_SIZE
2270#endif 2273#endif
2271 2274
2275/*
2276 * Call the function if the target task is executing on a CPU right now:
2277 */
2278extern void task_oncpu_function_call(struct task_struct *p,
2279 void (*func) (void *info), void *info);
2280
2281
2272#ifdef CONFIG_MM_OWNER 2282#ifdef CONFIG_MM_OWNER
2273extern void mm_update_next_owner(struct mm_struct *mm); 2283extern void mm_update_next_owner(struct mm_struct *mm);
2274extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2284extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 04fb47bfb920..a549678b7c3c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
54struct compat_timeval; 54struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct perf_counter_hw_event;
57 58
58#include <linux/types.h> 59#include <linux/types.h>
59#include <linux/aio_abi.h> 60#include <linux/aio_abi.h>
@@ -624,4 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
624 625
625int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 626int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
626 627
628
629asmlinkage int sys_perf_counter_open(
630
631 struct perf_counter_hw_event *hw_event_uptr __user,
632 pid_t pid,
633 int cpu,
634 int group_fd);
627#endif 635#endif
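
A minimal user-space sketch of the new system call, assuming the __NR_perf_counter_open number added to the unistd headers by this patch and assuming the ABI definitions at the top of <linux/perf_counter.h> are visible to user space (the header is not exported at this point). Passing pid 0, cpu -1 and group_fd -1 requests a simple counter on the calling task:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    int main(void)
    {
    	struct perf_counter_hw_event hw_event;
    	unsigned long long count;
    	int fd;

    	memset(&hw_event, 0, sizeof(hw_event));
    	hw_event.type = PERF_COUNT_INSTRUCTIONS;

    	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
    	if (fd < 0) {
    		perror("perf_counter_open");
    		return 1;
    	}

    	/* ... workload to be measured ... */

    	if (read(fd, &count, sizeof(count)) != sizeof(count))
    		perror("read");
    	else
    		printf("instructions: %llu\n", count);

    	close(fd);
    	return 0;
    }
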
diff --git a/init/Kconfig b/init/Kconfig
index 8a63c404ef44..c38ae71a5e19 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -739,6 +739,36 @@ config AIO
739 by some high performance threaded applications. Disabling 739 by some high performance threaded applications. Disabling
740 this option saves about 7k. 740 this option saves about 7k.
741 741
742config HAVE_PERF_COUNTERS
743 bool
744
745menu "Performance Counters"
746
747config PERF_COUNTERS
748 bool "Kernel Performance Counters"
749 depends on HAVE_PERF_COUNTERS
750 default y
751 select ANON_INODES
752 help
753 Enable kernel support for performance counter hardware.
754
755 Performance counters are special hardware registers available
756 on most modern CPUs. These registers count the number of certain
757 types of hw events: such as instructions executed, cachemisses
758 suffered, or branches mis-predicted - without slowing down the
759 kernel or applications. These registers can also trigger interrupts
760 when a threshold number of events have passed - and can thus be
761 used to profile the code that runs on that CPU.
762
763 The Linux Performance Counter subsystem provides an abstraction of
764 these hardware capabilities, available via a system call. It
765 provides per task and per CPU counters, and event
766 capabilities on top of those.
767
768 Say Y if unsure.
769
770endmenu
771
742config VM_EVENT_COUNTERS 772config VM_EVENT_COUNTERS
743 default y 773 default y
744 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 774 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 027edda63511..4476da868f86 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -88,6 +88,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
88obj-$(CONFIG_FUNCTION_TRACER) += trace/ 88obj-$(CONFIG_FUNCTION_TRACER) += trace/
89obj-$(CONFIG_TRACING) += trace/ 89obj-$(CONFIG_TRACING) += trace/
90obj-$(CONFIG_SMP) += sched_cpupri.o 90obj-$(CONFIG_SMP) += sched_cpupri.o
91obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
91 92
92ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 93ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
93# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index c7422ca92038..ad8d04d83a2e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -159,6 +159,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
159{ 159{
160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
161 161
162#ifdef CONFIG_PERF_COUNTERS
163 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
164#endif
162 trace_sched_process_free(tsk); 165 trace_sched_process_free(tsk);
163 put_task_struct(tsk); 166 put_task_struct(tsk);
164} 167}
@@ -1102,10 +1105,6 @@ NORET_TYPE void do_exit(long code)
1102 tsk->mempolicy = NULL; 1105 tsk->mempolicy = NULL;
1103#endif 1106#endif
1104#ifdef CONFIG_FUTEX 1107#ifdef CONFIG_FUTEX
1105 /*
1106 * This must happen late, after the PID is not
1107 * hashed anymore:
1108 */
1109 if (unlikely(!list_empty(&tsk->pi_state_list))) 1108 if (unlikely(!list_empty(&tsk->pi_state_list)))
1110 exit_pi_state_list(tsk); 1109 exit_pi_state_list(tsk);
1111 if (unlikely(current->pi_state_cache)) 1110 if (unlikely(current->pi_state_cache))
@@ -1370,6 +1369,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1370 */ 1369 */
1371 read_unlock(&tasklist_lock); 1370 read_unlock(&tasklist_lock);
1372 1371
1372 /*
1373 * Flush inherited counters to the parent - before the parent
1374 * gets woken up by child-exit notifications.
1375 */
1376 perf_counter_exit_task(p);
1377
1373 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1378 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1374 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1379 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1375 ? p->signal->group_exit_code : p->exit_code; 1380 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index 6144b36cd897..cb706599057f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -974,6 +974,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
974 goto fork_out; 974 goto fork_out;
975 975
976 rt_mutex_init_task(p); 976 rt_mutex_init_task(p);
977 perf_counter_init_task(p);
977 978
978#ifdef CONFIG_PROVE_LOCKING 979#ifdef CONFIG_PROVE_LOCKING
979 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 980 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..37f771691f93
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,1686 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/file.h>
14#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
21#include <linux/kernel_stat.h>
22#include <linux/perf_counter.h>
23
24/*
25 * Each CPU has a list of per CPU counters:
26 */
27DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
28
29int perf_max_counters __read_mostly = 1;
30static int perf_reserved_percpu __read_mostly;
31static int perf_overcommit __read_mostly = 1;
32
33/*
34 * Mutex for (sysadmin-configurable) counter reservations:
35 */
36static DEFINE_MUTEX(perf_resource_mutex);
37
38/*
39 * Architecture provided APIs - weak aliases:
40 */
41extern __weak const struct hw_perf_counter_ops *
42hw_perf_counter_init(struct perf_counter *counter)
43{
44 return ERR_PTR(-EINVAL);
45}
46
47u64 __weak hw_perf_save_disable(void) { return 0; }
48void __weak hw_perf_restore(u64 ctrl) { barrier(); }
49void __weak hw_perf_counter_setup(void) { barrier(); }
50
51static void
52list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
53{
54 struct perf_counter *group_leader = counter->group_leader;
55
56 /*
57 * Depending on whether it is a standalone or sibling counter,
58 * add it straight to the context's counter list, or to the group
59 * leader's sibling list:
60 */
61 if (counter->group_leader == counter)
62 list_add_tail(&counter->list_entry, &ctx->counter_list);
63 else
64 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
65}
66
67static void
68list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
69{
70 struct perf_counter *sibling, *tmp;
71
72 list_del_init(&counter->list_entry);
73
74 /*
75 * If this was a group counter with sibling counters then
76 * upgrade the siblings to singleton counters by adding them
77 * to the context list directly:
78 */
79 list_for_each_entry_safe(sibling, tmp,
80 &counter->sibling_list, list_entry) {
81
82 list_del_init(&sibling->list_entry);
83 list_add_tail(&sibling->list_entry, &ctx->counter_list);
84 sibling->group_leader = sibling;
85 }
86}
87
88/*
89 * Cross CPU call to remove a performance counter
90 *
91 * We disable the counter on the hardware level first. After that we
92 * remove it from the context list.
93 */
94static void __perf_counter_remove_from_context(void *info)
95{
96 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
97 struct perf_counter *counter = info;
98 struct perf_counter_context *ctx = counter->ctx;
99 unsigned long flags;
100 u64 perf_flags;
101
102 /*
103 * If this is a task context, we need to check whether it is
104 * the current task context of this cpu. If not it has been
105 * scheduled out before the smp call arrived.
106 */
107 if (ctx->task && cpuctx->task_ctx != ctx)
108 return;
109
110 curr_rq_lock_irq_save(&flags);
111 spin_lock(&ctx->lock);
112
113 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
114 counter->state = PERF_COUNTER_STATE_INACTIVE;
115 counter->hw_ops->disable(counter);
116 ctx->nr_active--;
117 cpuctx->active_oncpu--;
118 counter->task = NULL;
119 counter->oncpu = -1;
120 }
121 ctx->nr_counters--;
122
123 /*
124 * Protect the list operation against NMI by disabling the
125 * counters on a global level. NOP for non NMI based counters.
126 */
127 perf_flags = hw_perf_save_disable();
128 list_del_counter(counter, ctx);
129 hw_perf_restore(perf_flags);
130
131 if (!ctx->task) {
132 /*
133 * Allow more per task counters with respect to the
134 * reservation:
135 */
136 cpuctx->max_pertask =
137 min(perf_max_counters - ctx->nr_counters,
138 perf_max_counters - perf_reserved_percpu);
139 }
140
141 spin_unlock(&ctx->lock);
142 curr_rq_unlock_irq_restore(&flags);
143}
144
145
146/*
147 * Remove the counter from a task's (or a CPU's) list of counters.
148 *
149 * Must be called with counter->mutex held.
150 *
151 * CPU counters are removed with a smp call. For task counters we only
152 * call when the task is on a CPU.
153 */
154static void perf_counter_remove_from_context(struct perf_counter *counter)
155{
156 struct perf_counter_context *ctx = counter->ctx;
157 struct task_struct *task = ctx->task;
158
159 if (!task) {
160 /*
161 * Per cpu counters are removed via an smp call and
 162 * the removal is always successful.
163 */
164 smp_call_function_single(counter->cpu,
165 __perf_counter_remove_from_context,
166 counter, 1);
167 return;
168 }
169
170retry:
171 task_oncpu_function_call(task, __perf_counter_remove_from_context,
172 counter);
173
174 spin_lock_irq(&ctx->lock);
175 /*
176 * If the context is active we need to retry the smp call.
177 */
178 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
179 spin_unlock_irq(&ctx->lock);
180 goto retry;
181 }
182
183 /*
 184 * The lock prevents this context from being scheduled in, so we
 185 * can remove the counter safely if the call above did not
186 * succeed.
187 */
188 if (!list_empty(&counter->list_entry)) {
189 ctx->nr_counters--;
190 list_del_counter(counter, ctx);
191 counter->task = NULL;
192 }
193 spin_unlock_irq(&ctx->lock);
194}
195
196static int
197counter_sched_in(struct perf_counter *counter,
198 struct perf_cpu_context *cpuctx,
199 struct perf_counter_context *ctx,
200 int cpu)
201{
202 if (counter->state == PERF_COUNTER_STATE_OFF)
203 return 0;
204
205 counter->state = PERF_COUNTER_STATE_ACTIVE;
206 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
207 /*
208 * The new state must be visible before we turn it on in the hardware:
209 */
210 smp_wmb();
211
212 if (counter->hw_ops->enable(counter)) {
213 counter->state = PERF_COUNTER_STATE_INACTIVE;
214 counter->oncpu = -1;
215 return -EAGAIN;
216 }
217
218 cpuctx->active_oncpu++;
219 ctx->nr_active++;
220
221 return 0;
222}
223
224/*
225 * Cross CPU call to install and enable a performance counter
226 */
227static void __perf_install_in_context(void *info)
228{
229 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
230 struct perf_counter *counter = info;
231 struct perf_counter_context *ctx = counter->ctx;
232 int cpu = smp_processor_id();
233 unsigned long flags;
234 u64 perf_flags;
235
236 /*
237 * If this is a task context, we need to check whether it is
238 * the current task context of this cpu. If not it has been
239 * scheduled out before the smp call arrived.
240 */
241 if (ctx->task && cpuctx->task_ctx != ctx)
242 return;
243
244 curr_rq_lock_irq_save(&flags);
245 spin_lock(&ctx->lock);
246
247 /*
248 * Protect the list operation against NMI by disabling the
249 * counters on a global level. NOP for non NMI based counters.
250 */
251 perf_flags = hw_perf_save_disable();
252
253 list_add_counter(counter, ctx);
254 ctx->nr_counters++;
255
256 counter_sched_in(counter, cpuctx, ctx, cpu);
257
258 if (!ctx->task && cpuctx->max_pertask)
259 cpuctx->max_pertask--;
260
261 hw_perf_restore(perf_flags);
262
263 spin_unlock(&ctx->lock);
264 curr_rq_unlock_irq_restore(&flags);
265}
266
267/*
268 * Attach a performance counter to a context
269 *
270 * First we add the counter to the list with the hardware enable bit
271 * in counter->hw_config cleared.
272 *
273 * If the counter is attached to a task which is on a CPU we use a smp
274 * call to enable it in the task context. The task might have been
275 * scheduled away, but we check this in the smp call again.
276 */
277static void
278perf_install_in_context(struct perf_counter_context *ctx,
279 struct perf_counter *counter,
280 int cpu)
281{
282 struct task_struct *task = ctx->task;
283
284 counter->ctx = ctx;
285 if (!task) {
286 /*
287 * Per cpu counters are installed via an smp call and
 288 * the install is always successful.
289 */
290 smp_call_function_single(cpu, __perf_install_in_context,
291 counter, 1);
292 return;
293 }
294
295 counter->task = task;
296retry:
297 task_oncpu_function_call(task, __perf_install_in_context,
298 counter);
299
300 spin_lock_irq(&ctx->lock);
301 /*
 302 * If the context is active and the counter was not added, retry the smp call.
303 */
304 if (ctx->nr_active && list_empty(&counter->list_entry)) {
305 spin_unlock_irq(&ctx->lock);
306 goto retry;
307 }
308
309 /*
 310 * The lock prevents this context from being scheduled in, so we
 311 * can add the counter safely if the call above did not
312 * succeed.
313 */
314 if (list_empty(&counter->list_entry)) {
315 list_add_counter(counter, ctx);
316 ctx->nr_counters++;
317 }
318 spin_unlock_irq(&ctx->lock);
319}
320
321static void
322counter_sched_out(struct perf_counter *counter,
323 struct perf_cpu_context *cpuctx,
324 struct perf_counter_context *ctx)
325{
326 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
327 return;
328
329 counter->state = PERF_COUNTER_STATE_INACTIVE;
330 counter->hw_ops->disable(counter);
331 counter->oncpu = -1;
332
333 cpuctx->active_oncpu--;
334 ctx->nr_active--;
335}
336
337static void
338group_sched_out(struct perf_counter *group_counter,
339 struct perf_cpu_context *cpuctx,
340 struct perf_counter_context *ctx)
341{
342 struct perf_counter *counter;
343
344 counter_sched_out(group_counter, cpuctx, ctx);
345
346 /*
347 * Schedule out siblings (if any):
348 */
349 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
350 counter_sched_out(counter, cpuctx, ctx);
351}
352
353void __perf_counter_sched_out(struct perf_counter_context *ctx,
354 struct perf_cpu_context *cpuctx)
355{
356 struct perf_counter *counter;
357
358 if (likely(!ctx->nr_counters))
359 return;
360
361 spin_lock(&ctx->lock);
362 if (ctx->nr_active) {
363 list_for_each_entry(counter, &ctx->counter_list, list_entry)
364 group_sched_out(counter, cpuctx, ctx);
365 }
366 spin_unlock(&ctx->lock);
367}
368
369/*
370 * Called from scheduler to remove the counters of the current task,
371 * with interrupts disabled.
372 *
373 * We stop each counter and update the counter value in counter->count.
374 *
375 * This does not protect us against NMI, but disable()
376 * sets the disabled bit in the control field of counter _before_
377 * accessing the counter control register. If a NMI hits, then it will
378 * not restart the counter.
379 */
380void perf_counter_task_sched_out(struct task_struct *task, int cpu)
381{
382 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
383 struct perf_counter_context *ctx = &task->perf_counter_ctx;
384
385 if (likely(!cpuctx->task_ctx))
386 return;
387
388 __perf_counter_sched_out(ctx, cpuctx);
389
390 cpuctx->task_ctx = NULL;
391}
392
393static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
394{
395 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
396}
397
398static int
399group_sched_in(struct perf_counter *group_counter,
400 struct perf_cpu_context *cpuctx,
401 struct perf_counter_context *ctx,
402 int cpu)
403{
404 struct perf_counter *counter, *partial_group;
405 int ret = 0;
406
407 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
408 return -EAGAIN;
409
410 /*
411 * Schedule in siblings as one group (if any):
412 */
413 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
414 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
415 partial_group = counter;
416 goto group_error;
417 }
418 ret = -EAGAIN;
419 }
420
421 return ret;
422
423group_error:
424 /*
425 * Groups can be scheduled in as one unit only, so undo any
426 * partial group before returning:
427 */
428 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
429 if (counter == partial_group)
430 break;
431 counter_sched_out(counter, cpuctx, ctx);
432 }
433 counter_sched_out(group_counter, cpuctx, ctx);
434
435 return -EAGAIN;
436}
437
438static void
439__perf_counter_sched_in(struct perf_counter_context *ctx,
440 struct perf_cpu_context *cpuctx, int cpu)
441{
442 struct perf_counter *counter;
443
444 if (likely(!ctx->nr_counters))
445 return;
446
447 spin_lock(&ctx->lock);
448 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
449 /*
450 * Listen to the 'cpu' scheduling filter constraint
451 * of counters:
452 */
453 if (counter->cpu != -1 && counter->cpu != cpu)
454 continue;
455
456 /*
457 * If we scheduled in a group atomically and
458 * exclusively, break out:
459 */
460 if (group_sched_in(counter, cpuctx, ctx, cpu))
461 break;
462 }
463 spin_unlock(&ctx->lock);
464}
465
466/*
467 * Called from scheduler to add the counters of the current task
468 * with interrupts disabled.
469 *
470 * We restore the counter value and then enable it.
471 *
472 * This does not protect us against NMI, but enable()
473 * sets the enabled bit in the control field of counter _before_
474 * accessing the counter control register. If a NMI hits, then it will
475 * keep the counter running.
476 */
477void perf_counter_task_sched_in(struct task_struct *task, int cpu)
478{
479 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
480 struct perf_counter_context *ctx = &task->perf_counter_ctx;
481
482 __perf_counter_sched_in(ctx, cpuctx, cpu);
483 cpuctx->task_ctx = ctx;
484}
485
486static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
487{
488 struct perf_counter_context *ctx = &cpuctx->ctx;
489
490 __perf_counter_sched_in(ctx, cpuctx, cpu);
491}
492
493int perf_counter_task_disable(void)
494{
495 struct task_struct *curr = current;
496 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
497 struct perf_counter *counter;
498 unsigned long flags;
499 u64 perf_flags;
500 int cpu;
501
502 if (likely(!ctx->nr_counters))
503 return 0;
504
505 curr_rq_lock_irq_save(&flags);
506 cpu = smp_processor_id();
507
508 /* force the update of the task clock: */
509 __task_delta_exec(curr, 1);
510
511 perf_counter_task_sched_out(curr, cpu);
512
513 spin_lock(&ctx->lock);
514
515 /*
516 * Disable all the counters:
517 */
518 perf_flags = hw_perf_save_disable();
519
520 list_for_each_entry(counter, &ctx->counter_list, list_entry)
521 counter->state = PERF_COUNTER_STATE_OFF;
522
523 hw_perf_restore(perf_flags);
524
525 spin_unlock(&ctx->lock);
526
527 curr_rq_unlock_irq_restore(&flags);
528
529 return 0;
530}
531
532int perf_counter_task_enable(void)
533{
534 struct task_struct *curr = current;
535 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
536 struct perf_counter *counter;
537 unsigned long flags;
538 u64 perf_flags;
539 int cpu;
540
541 if (likely(!ctx->nr_counters))
542 return 0;
543
544 curr_rq_lock_irq_save(&flags);
545 cpu = smp_processor_id();
546
547 /* force the update of the task clock: */
548 __task_delta_exec(curr, 1);
549
550 perf_counter_task_sched_out(curr, cpu);
551
552 spin_lock(&ctx->lock);
553
554 /*
555 * Disable all the counters:
556 */
557 perf_flags = hw_perf_save_disable();
558
559 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
560 if (counter->state != PERF_COUNTER_STATE_OFF)
561 continue;
562 counter->state = PERF_COUNTER_STATE_INACTIVE;
563 counter->hw_event.disabled = 0;
564 }
565 hw_perf_restore(perf_flags);
566
567 spin_unlock(&ctx->lock);
568
569 perf_counter_task_sched_in(curr, cpu);
570
571 curr_rq_unlock_irq_restore(&flags);
572
573 return 0;
574}
575
576/*
577 * Round-robin a context's counters:
578 */
579static void rotate_ctx(struct perf_counter_context *ctx)
580{
581 struct perf_counter *counter;
582 u64 perf_flags;
583
584 if (!ctx->nr_counters)
585 return;
586
587 spin_lock(&ctx->lock);
588 /*
589 * Rotate the first entry last (works just fine for group counters too):
590 */
591 perf_flags = hw_perf_save_disable();
592 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
593 list_del(&counter->list_entry);
594 list_add_tail(&counter->list_entry, &ctx->counter_list);
595 break;
596 }
597 hw_perf_restore(perf_flags);
598
599 spin_unlock(&ctx->lock);
600}
601
602void perf_counter_task_tick(struct task_struct *curr, int cpu)
603{
604 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
605 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
606 const int rotate_percpu = 0;
607
608 if (rotate_percpu)
609 perf_counter_cpu_sched_out(cpuctx);
610 perf_counter_task_sched_out(curr, cpu);
611
612 if (rotate_percpu)
613 rotate_ctx(&cpuctx->ctx);
614 rotate_ctx(ctx);
615
616 if (rotate_percpu)
617 perf_counter_cpu_sched_in(cpuctx, cpu);
618 perf_counter_task_sched_in(curr, cpu);
619}
620
621/*
622 * Cross CPU call to read the hardware counter
623 */
624static void __read(void *info)
625{
626 struct perf_counter *counter = info;
627 unsigned long flags;
628
629 curr_rq_lock_irq_save(&flags);
630 counter->hw_ops->read(counter);
631 curr_rq_unlock_irq_restore(&flags);
632}
633
634static u64 perf_counter_read(struct perf_counter *counter)
635{
636 /*
637 * If counter is enabled and currently active on a CPU, update the
638 * value in the counter structure:
639 */
640 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
641 smp_call_function_single(counter->oncpu,
642 __read, counter, 1);
643 }
644
645 return atomic64_read(&counter->count);
646}
647
648/*
649 * Cross CPU call to switch performance data pointers
650 */
651static void __perf_switch_irq_data(void *info)
652{
653 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
654 struct perf_counter *counter = info;
655 struct perf_counter_context *ctx = counter->ctx;
656 struct perf_data *oldirqdata = counter->irqdata;
657
658 /*
659 * If this is a task context, we need to check whether it is
660 * the current task context of this cpu. If not it has been
661 * scheduled out before the smp call arrived.
662 */
663 if (ctx->task) {
664 if (cpuctx->task_ctx != ctx)
665 return;
666 spin_lock(&ctx->lock);
667 }
668
 669 /* Change the pointer in an NMI-safe way */
670 atomic_long_set((atomic_long_t *)&counter->irqdata,
671 (unsigned long) counter->usrdata);
672 counter->usrdata = oldirqdata;
673
674 if (ctx->task)
675 spin_unlock(&ctx->lock);
676}
677
678static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
679{
680 struct perf_counter_context *ctx = counter->ctx;
681 struct perf_data *oldirqdata = counter->irqdata;
682 struct task_struct *task = ctx->task;
683
684 if (!task) {
685 smp_call_function_single(counter->cpu,
686 __perf_switch_irq_data,
687 counter, 1);
688 return counter->usrdata;
689 }
690
691retry:
692 spin_lock_irq(&ctx->lock);
693 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
694 counter->irqdata = counter->usrdata;
695 counter->usrdata = oldirqdata;
696 spin_unlock_irq(&ctx->lock);
697 return oldirqdata;
698 }
699 spin_unlock_irq(&ctx->lock);
700 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
701 /* Might have failed, because task was scheduled out */
702 if (counter->irqdata == oldirqdata)
703 goto retry;
704
705 return counter->usrdata;
706}
707
708static void put_context(struct perf_counter_context *ctx)
709{
710 if (ctx->task)
711 put_task_struct(ctx->task);
712}
713
714static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
715{
716 struct perf_cpu_context *cpuctx;
717 struct perf_counter_context *ctx;
718 struct task_struct *task;
719
720 /*
721 * If cpu is not a wildcard then this is a percpu counter:
722 */
723 if (cpu != -1) {
724 /* Must be root to operate on a CPU counter: */
725 if (!capable(CAP_SYS_ADMIN))
726 return ERR_PTR(-EACCES);
727
728 if (cpu < 0 || cpu > num_possible_cpus())
729 return ERR_PTR(-EINVAL);
730
731 /*
732 * We could be clever and allow to attach a counter to an
733 * offline CPU and activate it when the CPU comes up, but
734 * that's for later.
735 */
736 if (!cpu_isset(cpu, cpu_online_map))
737 return ERR_PTR(-ENODEV);
738
739 cpuctx = &per_cpu(perf_cpu_context, cpu);
740 ctx = &cpuctx->ctx;
741
742 return ctx;
743 }
744
745 rcu_read_lock();
746 if (!pid)
747 task = current;
748 else
749 task = find_task_by_vpid(pid);
750 if (task)
751 get_task_struct(task);
752 rcu_read_unlock();
753
754 if (!task)
755 return ERR_PTR(-ESRCH);
756
757 ctx = &task->perf_counter_ctx;
758 ctx->task = task;
759
760 /* Reuse ptrace permission checks for now. */
761 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
762 put_context(ctx);
763 return ERR_PTR(-EACCES);
764 }
765
766 return ctx;
767}
768
769/*
770 * Called when the last reference to the file is gone.
771 */
772static int perf_release(struct inode *inode, struct file *file)
773{
774 struct perf_counter *counter = file->private_data;
775 struct perf_counter_context *ctx = counter->ctx;
776
777 file->private_data = NULL;
778
779 mutex_lock(&counter->mutex);
780
781 perf_counter_remove_from_context(counter);
782 put_context(ctx);
783
784 mutex_unlock(&counter->mutex);
785
786 kfree(counter);
787
788 return 0;
789}
790
791/*
792 * Read the performance counter - simple non blocking version for now
793 */
794static ssize_t
795perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
796{
797 u64 cntval;
798
799 if (count != sizeof(cntval))
800 return -EINVAL;
801
802 mutex_lock(&counter->mutex);
803 cntval = perf_counter_read(counter);
804 mutex_unlock(&counter->mutex);
805
806 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
807}
808
809static ssize_t
810perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
811{
812 if (!usrdata->len)
813 return 0;
814
815 count = min(count, (size_t)usrdata->len);
816 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
817 return -EFAULT;
818
819 /* Adjust the counters */
820 usrdata->len -= count;
821 if (!usrdata->len)
822 usrdata->rd_idx = 0;
823 else
824 usrdata->rd_idx += count;
825
826 return count;
827}
828
829static ssize_t
830perf_read_irq_data(struct perf_counter *counter,
831 char __user *buf,
832 size_t count,
833 int nonblocking)
834{
835 struct perf_data *irqdata, *usrdata;
836 DECLARE_WAITQUEUE(wait, current);
837 ssize_t res;
838
839 irqdata = counter->irqdata;
840 usrdata = counter->usrdata;
841
842 if (usrdata->len + irqdata->len >= count)
843 goto read_pending;
844
845 if (nonblocking)
846 return -EAGAIN;
847
848 spin_lock_irq(&counter->waitq.lock);
849 __add_wait_queue(&counter->waitq, &wait);
850 for (;;) {
851 set_current_state(TASK_INTERRUPTIBLE);
852 if (usrdata->len + irqdata->len >= count)
853 break;
854
855 if (signal_pending(current))
856 break;
857
858 spin_unlock_irq(&counter->waitq.lock);
859 schedule();
860 spin_lock_irq(&counter->waitq.lock);
861 }
862 __remove_wait_queue(&counter->waitq, &wait);
863 __set_current_state(TASK_RUNNING);
864 spin_unlock_irq(&counter->waitq.lock);
865
866 if (usrdata->len + irqdata->len < count)
867 return -ERESTARTSYS;
868read_pending:
869 mutex_lock(&counter->mutex);
870
871 /* Drain pending data first: */
872 res = perf_copy_usrdata(usrdata, buf, count);
873 if (res < 0 || res == count)
874 goto out;
875
876 /* Switch irq buffer: */
877 usrdata = perf_switch_irq_data(counter);
878 if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
879 if (!res)
880 res = -EFAULT;
881 } else {
882 res = count;
883 }
884out:
885 mutex_unlock(&counter->mutex);
886
887 return res;
888}
889
890static ssize_t
891perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
892{
893 struct perf_counter *counter = file->private_data;
894
895 switch (counter->hw_event.record_type) {
896 case PERF_RECORD_SIMPLE:
897 return perf_read_hw(counter, buf, count);
898
899 case PERF_RECORD_IRQ:
900 case PERF_RECORD_GROUP:
901 return perf_read_irq_data(counter, buf, count,
902 file->f_flags & O_NONBLOCK);
903 }
904 return -EINVAL;
905}
906
907static unsigned int perf_poll(struct file *file, poll_table *wait)
908{
909 struct perf_counter *counter = file->private_data;
910 unsigned int events = 0;
911 unsigned long flags;
912
913 poll_wait(file, &counter->waitq, wait);
914
915 spin_lock_irqsave(&counter->waitq.lock, flags);
916 if (counter->usrdata->len || counter->irqdata->len)
917 events |= POLLIN;
918 spin_unlock_irqrestore(&counter->waitq.lock, flags);
919
920 return events;
921}
922
923static const struct file_operations perf_fops = {
924 .release = perf_release,
925 .read = perf_read,
926 .poll = perf_poll,
927};
928
929static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
930{
931 return 0;
932}
933
934static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
935{
936}
937
938static void cpu_clock_perf_counter_read(struct perf_counter *counter)
939{
940 int cpu = raw_smp_processor_id();
941
942 atomic64_set(&counter->count, cpu_clock(cpu));
943}
944
945static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
946 .enable = cpu_clock_perf_counter_enable,
947 .disable = cpu_clock_perf_counter_disable,
948 .read = cpu_clock_perf_counter_read,
949};
950
951/*
952 * Called from within the scheduler:
953 */
954static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
955{
956 struct task_struct *curr = counter->task;
957 u64 delta;
958
959 delta = __task_delta_exec(curr, update);
960
961 return curr->se.sum_exec_runtime + delta;
962}
963
964static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
965{
966 u64 prev;
967 s64 delta;
968
969 prev = atomic64_read(&counter->hw.prev_count);
970
971 atomic64_set(&counter->hw.prev_count, now);
972
973 delta = now - prev;
974
975 atomic64_add(delta, &counter->count);
976}
977
978static void task_clock_perf_counter_read(struct perf_counter *counter)
979{
980 u64 now = task_clock_perf_counter_val(counter, 1);
981
982 task_clock_perf_counter_update(counter, now);
983}
984
985static int task_clock_perf_counter_enable(struct perf_counter *counter)
986{
987 u64 now = task_clock_perf_counter_val(counter, 0);
988
989 atomic64_set(&counter->hw.prev_count, now);
990
991 return 0;
992}
993
994static void task_clock_perf_counter_disable(struct perf_counter *counter)
995{
996 u64 now = task_clock_perf_counter_val(counter, 0);
997
998 task_clock_perf_counter_update(counter, now);
999}
1000
1001static const struct hw_perf_counter_ops perf_ops_task_clock = {
1002 .enable = task_clock_perf_counter_enable,
1003 .disable = task_clock_perf_counter_disable,
1004 .read = task_clock_perf_counter_read,
1005};
1006
1007static u64 get_page_faults(void)
1008{
1009 struct task_struct *curr = current;
1010
1011 return curr->maj_flt + curr->min_flt;
1012}
1013
1014static void page_faults_perf_counter_update(struct perf_counter *counter)
1015{
1016 u64 prev, now;
1017 s64 delta;
1018
1019 prev = atomic64_read(&counter->hw.prev_count);
1020 now = get_page_faults();
1021
1022 atomic64_set(&counter->hw.prev_count, now);
1023
1024 delta = now - prev;
1025
1026 atomic64_add(delta, &counter->count);
1027}
1028
1029static void page_faults_perf_counter_read(struct perf_counter *counter)
1030{
1031 page_faults_perf_counter_update(counter);
1032}
1033
1034static int page_faults_perf_counter_enable(struct perf_counter *counter)
1035{
1036 /*
1037 * page-faults is a per-task value already,
 1038 * so we don't have to clear it on switch-in.
1039 */
1040
1041 return 0;
1042}
1043
1044static void page_faults_perf_counter_disable(struct perf_counter *counter)
1045{
1046 page_faults_perf_counter_update(counter);
1047}
1048
1049static const struct hw_perf_counter_ops perf_ops_page_faults = {
1050 .enable = page_faults_perf_counter_enable,
1051 .disable = page_faults_perf_counter_disable,
1052 .read = page_faults_perf_counter_read,
1053};
1054
1055static u64 get_context_switches(void)
1056{
1057 struct task_struct *curr = current;
1058
1059 return curr->nvcsw + curr->nivcsw;
1060}
1061
1062static void context_switches_perf_counter_update(struct perf_counter *counter)
1063{
1064 u64 prev, now;
1065 s64 delta;
1066
1067 prev = atomic64_read(&counter->hw.prev_count);
1068 now = get_context_switches();
1069
1070 atomic64_set(&counter->hw.prev_count, now);
1071
1072 delta = now - prev;
1073
1074 atomic64_add(delta, &counter->count);
1075}
1076
1077static void context_switches_perf_counter_read(struct perf_counter *counter)
1078{
1079 context_switches_perf_counter_update(counter);
1080}
1081
1082static int context_switches_perf_counter_enable(struct perf_counter *counter)
1083{
1084 /*
1085 * ->nvcsw + curr->nivcsw is a per-task value already,
 1086 * so we don't have to clear it on switch-in.
1087 */
1088
1089 return 0;
1090}
1091
1092static void context_switches_perf_counter_disable(struct perf_counter *counter)
1093{
1094 context_switches_perf_counter_update(counter);
1095}
1096
1097static const struct hw_perf_counter_ops perf_ops_context_switches = {
1098 .enable = context_switches_perf_counter_enable,
1099 .disable = context_switches_perf_counter_disable,
1100 .read = context_switches_perf_counter_read,
1101};
1102
1103static inline u64 get_cpu_migrations(void)
1104{
1105 return current->se.nr_migrations;
1106}
1107
1108static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1109{
1110 u64 prev, now;
1111 s64 delta;
1112
1113 prev = atomic64_read(&counter->hw.prev_count);
1114 now = get_cpu_migrations();
1115
1116 atomic64_set(&counter->hw.prev_count, now);
1117
1118 delta = now - prev;
1119
1120 atomic64_add(delta, &counter->count);
1121}
1122
1123static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1124{
1125 cpu_migrations_perf_counter_update(counter);
1126}
1127
1128static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1129{
1130 /*
1131 * se.nr_migrations is a per-task value already,
 1132 * so we don't have to clear it on switch-in.
1133 */
1134
1135 return 0;
1136}
1137
1138static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1139{
1140 cpu_migrations_perf_counter_update(counter);
1141}
1142
1143static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1144 .enable = cpu_migrations_perf_counter_enable,
1145 .disable = cpu_migrations_perf_counter_disable,
1146 .read = cpu_migrations_perf_counter_read,
1147};
1148
1149static const struct hw_perf_counter_ops *
1150sw_perf_counter_init(struct perf_counter *counter)
1151{
1152 const struct hw_perf_counter_ops *hw_ops = NULL;
1153
1154 switch (counter->hw_event.type) {
1155 case PERF_COUNT_CPU_CLOCK:
1156 hw_ops = &perf_ops_cpu_clock;
1157 break;
1158 case PERF_COUNT_TASK_CLOCK:
1159 hw_ops = &perf_ops_task_clock;
1160 break;
1161 case PERF_COUNT_PAGE_FAULTS:
1162 hw_ops = &perf_ops_page_faults;
1163 break;
1164 case PERF_COUNT_CONTEXT_SWITCHES:
1165 hw_ops = &perf_ops_context_switches;
1166 break;
1167 case PERF_COUNT_CPU_MIGRATIONS:
1168 hw_ops = &perf_ops_cpu_migrations;
1169 break;
1170 default:
1171 break;
1172 }
1173 return hw_ops;
1174}
1175
1176/*
1177 * Allocate and initialize a counter structure
1178 */
1179static struct perf_counter *
1180perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1181 int cpu,
1182 struct perf_counter *group_leader,
1183 gfp_t gfpflags)
1184{
1185 const struct hw_perf_counter_ops *hw_ops;
1186 struct perf_counter *counter;
1187
1188 counter = kzalloc(sizeof(*counter), gfpflags);
1189 if (!counter)
1190 return NULL;
1191
1192 /*
1193 * Single counters are their own group leaders, with an
1194 * empty sibling list:
1195 */
1196 if (!group_leader)
1197 group_leader = counter;
1198
1199 mutex_init(&counter->mutex);
1200 INIT_LIST_HEAD(&counter->list_entry);
1201 INIT_LIST_HEAD(&counter->sibling_list);
1202 init_waitqueue_head(&counter->waitq);
1203
1204 counter->irqdata = &counter->data[0];
1205 counter->usrdata = &counter->data[1];
1206 counter->cpu = cpu;
1207 counter->hw_event = *hw_event;
1208 counter->wakeup_pending = 0;
1209 counter->group_leader = group_leader;
1210 counter->hw_ops = NULL;
1211
1212 counter->state = PERF_COUNTER_STATE_INACTIVE;
1213 if (hw_event->disabled)
1214 counter->state = PERF_COUNTER_STATE_OFF;
1215
1216 hw_ops = NULL;
1217 if (!hw_event->raw && hw_event->type < 0)
1218 hw_ops = sw_perf_counter_init(counter);
1219 if (!hw_ops)
1220 hw_ops = hw_perf_counter_init(counter);
1221
1222 if (!hw_ops) {
1223 kfree(counter);
1224 return NULL;
1225 }
1226 counter->hw_ops = hw_ops;
1227
1228 return counter;
1229}
1230
1231/**
 1232 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
1233 *
1234 * @hw_event_uptr: event type attributes for monitoring/sampling
1235 * @pid: target pid
1236 * @cpu: target cpu
1237 * @group_fd: group leader counter fd
1238 */
1239asmlinkage int
1240sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1241 pid_t pid, int cpu, int group_fd)
1242{
1243 struct perf_counter *counter, *group_leader;
1244 struct perf_counter_hw_event hw_event;
1245 struct perf_counter_context *ctx;
1246 struct file *counter_file = NULL;
1247 struct file *group_file = NULL;
1248 int fput_needed = 0;
1249 int fput_needed2 = 0;
1250 int ret;
1251
1252 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1253 return -EFAULT;
1254
1255 /*
1256 * Get the target context (task or percpu):
1257 */
1258 ctx = find_get_context(pid, cpu);
1259 if (IS_ERR(ctx))
1260 return PTR_ERR(ctx);
1261
1262 /*
1263 * Look up the group leader (we will attach this counter to it):
1264 */
1265 group_leader = NULL;
1266 if (group_fd != -1) {
1267 ret = -EINVAL;
1268 group_file = fget_light(group_fd, &fput_needed);
1269 if (!group_file)
1270 goto err_put_context;
1271 if (group_file->f_op != &perf_fops)
1272 goto err_put_context;
1273
1274 group_leader = group_file->private_data;
1275 /*
1276 * Do not allow a recursive hierarchy (this new sibling
1277 * becoming part of another group-sibling):
1278 */
1279 if (group_leader->group_leader != group_leader)
1280 goto err_put_context;
1281 /*
 1282 * Do not allow attaching to a group in a different
1283 * task or CPU context:
1284 */
1285 if (group_leader->ctx != ctx)
1286 goto err_put_context;
1287 }
1288
1289 ret = -EINVAL;
1290 counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1291 if (!counter)
1292 goto err_put_context;
1293
1294 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1295 if (ret < 0)
1296 goto err_free_put_context;
1297
1298 counter_file = fget_light(ret, &fput_needed2);
1299 if (!counter_file)
1300 goto err_free_put_context;
1301
1302 counter->filp = counter_file;
1303 perf_install_in_context(ctx, counter, cpu);
1304
1305 fput_light(counter_file, fput_needed2);
1306
1307out_fput:
1308 fput_light(group_file, fput_needed);
1309
1310 return ret;
1311
1312err_free_put_context:
1313 kfree(counter);
1314
1315err_put_context:
1316 put_context(ctx);
1317
1318 goto out_fput;
1319}
1320
1321/*
1322 * Initialize the perf_counter context in a task_struct:
1323 */
1324static void
1325__perf_counter_init_context(struct perf_counter_context *ctx,
1326 struct task_struct *task)
1327{
1328 memset(ctx, 0, sizeof(*ctx));
1329 spin_lock_init(&ctx->lock);
1330 INIT_LIST_HEAD(&ctx->counter_list);
1331 ctx->task = task;
1332}
1333
1334/*
1335 * inherit a counter from parent task to child task:
1336 */
1337static int
1338inherit_counter(struct perf_counter *parent_counter,
1339 struct task_struct *parent,
1340 struct perf_counter_context *parent_ctx,
1341 struct task_struct *child,
1342 struct perf_counter_context *child_ctx)
1343{
1344 struct perf_counter *child_counter;
1345
1346 child_counter = perf_counter_alloc(&parent_counter->hw_event,
1347 parent_counter->cpu, NULL,
1348 GFP_ATOMIC);
1349 if (!child_counter)
1350 return -ENOMEM;
1351
1352 /*
1353 * Link it up in the child's context:
1354 */
1355 child_counter->ctx = child_ctx;
1356 child_counter->task = child;
1357 list_add_counter(child_counter, child_ctx);
1358 child_ctx->nr_counters++;
1359
1360 child_counter->parent = parent_counter;
1361 /*
1362 * inherit into child's child as well:
1363 */
1364 child_counter->hw_event.inherit = 1;
1365
1366 /*
1367 * Get a reference to the parent filp - we will fput it
1368 * when the child counter exits. This is safe to do because
1369 * we are in the parent and we know that the filp still
1370 * exists and has a nonzero count:
1371 */
1372 atomic_long_inc(&parent_counter->filp->f_count);
1373
1374 return 0;
1375}
1376
1377static void
1378__perf_counter_exit_task(struct task_struct *child,
1379 struct perf_counter *child_counter,
1380 struct perf_counter_context *child_ctx)
1381{
1382 struct perf_counter *parent_counter;
1383 u64 parent_val, child_val;
1384
1385 /*
1386 * If we do not self-reap then we have to wait for the
1387 * child task to unschedule (it will happen for sure),
1388 * so that its counter is at its final count. (This
1389 * condition triggers rarely - child tasks usually get
1390 * off their CPU before the parent has a chance to
1391 * get this far into the reaping action)
1392 */
1393 if (child != current) {
1394 wait_task_inactive(child, 0);
1395 list_del_init(&child_counter->list_entry);
1396 } else {
1397 struct perf_cpu_context *cpuctx;
1398 unsigned long flags;
1399 u64 perf_flags;
1400
1401 /*
1402 * Disable and unlink this counter.
1403 *
1404 * Be careful about zapping the list - IRQ/NMI context
1405 * could still be processing it:
1406 */
1407 curr_rq_lock_irq_save(&flags);
1408 perf_flags = hw_perf_save_disable();
1409
1410 cpuctx = &__get_cpu_var(perf_cpu_context);
1411
1412 if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
1413 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1414 child_counter->hw_ops->disable(child_counter);
1415 cpuctx->active_oncpu--;
1416 child_ctx->nr_active--;
1417 child_counter->oncpu = -1;
1418 }
1419
1420 list_del_init(&child_counter->list_entry);
1421
1422 child_ctx->nr_counters--;
1423
1424 hw_perf_restore(perf_flags);
1425 curr_rq_unlock_irq_restore(&flags);
1426 }
1427
1428 parent_counter = child_counter->parent;
1429 /*
1430 * It can happen that parent exits first, and has counters
1431 * that are still around due to the child reference. These
1432 * counters need to be zapped - but otherwise linger.
1433 */
1434 if (!parent_counter)
1435 return;
1436
1437 parent_val = atomic64_read(&parent_counter->count);
1438 child_val = atomic64_read(&child_counter->count);
1439
1440 /*
1441 * Add back the child's count to the parent's count:
1442 */
1443 atomic64_add(child_val, &parent_counter->count);
1444
1445 fput(parent_counter->filp);
1446
1447 kfree(child_counter);
1448}
1449
1450/*
1451 * When a child task exits, feed back counter values to parent counters.
1452 *
1453 * Note: we are running in child context, but the PID is not hashed
1454 * anymore, so new counters will not be added.
1455 */
1456void perf_counter_exit_task(struct task_struct *child)
1457{
1458 struct perf_counter *child_counter, *tmp;
1459 struct perf_counter_context *child_ctx;
1460
1461 child_ctx = &child->perf_counter_ctx;
1462
1463 if (likely(!child_ctx->nr_counters))
1464 return;
1465
1466 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1467 list_entry)
1468 __perf_counter_exit_task(child, child_counter, child_ctx);
1469}
1470
1471/*
1472 * Initialize the perf_counter context in task_struct
1473 */
1474void perf_counter_init_task(struct task_struct *child)
1475{
1476 struct perf_counter_context *child_ctx, *parent_ctx;
1477 struct perf_counter *counter, *parent_counter;
1478 struct task_struct *parent = current;
1479 unsigned long flags;
1480
1481 child_ctx = &child->perf_counter_ctx;
1482 parent_ctx = &parent->perf_counter_ctx;
1483
1484 __perf_counter_init_context(child_ctx, child);
1485
1486 /*
1487 * This is executed from the parent task context, so inherit
1488 * counters that have been marked for cloning:
1489 */
1490
1491 if (likely(!parent_ctx->nr_counters))
1492 return;
1493
1494 /*
1495 * Lock the parent list. No need to lock the child - not PID
1496 * hashed yet and not running, so nobody can access it.
1497 */
1498 spin_lock_irqsave(&parent_ctx->lock, flags);
1499
1500 /*
1501 * We don't have to disable NMIs - we are only looking at
1502 * the list, not manipulating it:
1503 */
1504 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1505 if (!counter->hw_event.inherit || counter->group_leader != counter)
1506 continue;
1507
1508 /*
1509 * Instead of creating recursive hierarchies of counters,
1510 * we link inherited counters back to the original parent,
1511 * which has a filp for sure, which we use as the reference
1512 * count:
1513 */
1514 parent_counter = counter;
1515 if (counter->parent)
1516 parent_counter = counter->parent;
1517
1518 if (inherit_counter(parent_counter, parent,
1519 parent_ctx, child, child_ctx))
1520 break;
1521 }
1522
1523 spin_unlock_irqrestore(&parent_ctx->lock, flags);
1524}
1525
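
Taken together, perf_counter_init_task() and __perf_counter_exit_task() above give counters created with hw_event.inherit set fork()/exit() semantics: the child gets its own copy of each inheritable counter at fork time, and the child's final count is folded back into the parent counter when the child is reaped. A hedged user-space sketch of that behaviour, under the same ABI assumptions as the earlier example:

    /* hypothetical sketch of counter inheritance - not part of this patch */
    #include <string.h>
    #include <unistd.h>
    #include <sys/wait.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    int main(void)
    {
            struct perf_counter_hw_event ev;
            unsigned long long count;
            int fd;

            memset(&ev, 0, sizeof(ev));
            ev.type = 0;            /* placeholder event type */
            ev.inherit = 1;         /* clone this counter into children */

            fd = syscall(__NR_perf_counter_open, &ev, 0, -1, -1);

            if (fork() == 0) {
                    /* child: gets a private copy of the counter via
                     * perf_counter_init_task() and counts into it */
                    _exit(0);
            }
            wait(NULL);

            /* perf_counter_exit_task() has added the child's count back
             * into the parent counter, so this read includes the child */
            read(fd, &count, sizeof(count));
            return 0;
    }
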
1526static void __cpuinit perf_counter_init_cpu(int cpu)
1527{
1528 struct perf_cpu_context *cpuctx;
1529
1530 cpuctx = &per_cpu(perf_cpu_context, cpu);
1531 __perf_counter_init_context(&cpuctx->ctx, NULL);
1532
1533 mutex_lock(&perf_resource_mutex);
1534 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
1535 mutex_unlock(&perf_resource_mutex);
1536
1537 hw_perf_counter_setup();
1538}
1539
1540#ifdef CONFIG_HOTPLUG_CPU
1541static void __perf_counter_exit_cpu(void *info)
1542{
1543 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1544 struct perf_counter_context *ctx = &cpuctx->ctx;
1545 struct perf_counter *counter, *tmp;
1546
1547 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
1548 __perf_counter_remove_from_context(counter);
1549
1550}
1551static void perf_counter_exit_cpu(int cpu)
1552{
1553 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
1554}
1555#else
1556static inline void perf_counter_exit_cpu(int cpu) { }
1557#endif
1558
1559static int __cpuinit
1560perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
1561{
1562 unsigned int cpu = (long)hcpu;
1563
1564 switch (action) {
1565
1566 case CPU_UP_PREPARE:
1567 case CPU_UP_PREPARE_FROZEN:
1568 perf_counter_init_cpu(cpu);
1569 break;
1570
1571 case CPU_DOWN_PREPARE:
1572 case CPU_DOWN_PREPARE_FROZEN:
1573 perf_counter_exit_cpu(cpu);
1574 break;
1575
1576 default:
1577 break;
1578 }
1579
1580 return NOTIFY_OK;
1581}
1582
1583static struct notifier_block __cpuinitdata perf_cpu_nb = {
1584 .notifier_call = perf_cpu_notify,
1585};
1586
1587static int __init perf_counter_init(void)
1588{
1589 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
1590 (void *)(long)smp_processor_id());
1591 register_cpu_notifier(&perf_cpu_nb);
1592
1593 return 0;
1594}
1595early_initcall(perf_counter_init);
1596
1597static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
1598{
1599 return sprintf(buf, "%d\n", perf_reserved_percpu);
1600}
1601
1602static ssize_t
1603perf_set_reserve_percpu(struct sysdev_class *class,
1604 const char *buf,
1605 size_t count)
1606{
1607 struct perf_cpu_context *cpuctx;
1608 unsigned long val;
1609 int err, cpu, mpt;
1610
1611 err = strict_strtoul(buf, 10, &val);
1612 if (err)
1613 return err;
1614 if (val > perf_max_counters)
1615 return -EINVAL;
1616
1617 mutex_lock(&perf_resource_mutex);
1618 perf_reserved_percpu = val;
1619 for_each_online_cpu(cpu) {
1620 cpuctx = &per_cpu(perf_cpu_context, cpu);
1621 spin_lock_irq(&cpuctx->ctx.lock);
1622 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
1623 perf_max_counters - perf_reserved_percpu);
1624 cpuctx->max_pertask = mpt;
1625 spin_unlock_irq(&cpuctx->ctx.lock);
1626 }
1627 mutex_unlock(&perf_resource_mutex);
1628
1629 return count;
1630}
1631
1632static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
1633{
1634 return sprintf(buf, "%d\n", perf_overcommit);
1635}
1636
1637static ssize_t
1638perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
1639{
1640 unsigned long val;
1641 int err;
1642
1643 err = strict_strtoul(buf, 10, &val);
1644 if (err)
1645 return err;
1646 if (val > 1)
1647 return -EINVAL;
1648
1649 mutex_lock(&perf_resource_mutex);
1650 perf_overcommit = val;
1651 mutex_unlock(&perf_resource_mutex);
1652
1653 return count;
1654}
1655
1656static SYSDEV_CLASS_ATTR(
1657 reserve_percpu,
1658 0644,
1659 perf_show_reserve_percpu,
1660 perf_set_reserve_percpu
1661 );
1662
1663static SYSDEV_CLASS_ATTR(
1664 overcommit,
1665 0644,
1666 perf_show_overcommit,
1667 perf_set_overcommit
1668 );
1669
1670static struct attribute *perfclass_attrs[] = {
1671 &attr_reserve_percpu.attr,
1672 &attr_overcommit.attr,
1673 NULL
1674};
1675
1676static struct attribute_group perfclass_attr_group = {
1677 .attrs = perfclass_attrs,
1678 .name = "perf_counters",
1679};
1680
1681static int __init perf_counter_sysfs_init(void)
1682{
1683 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
1684 &perfclass_attr_group);
1685}
1686device_initcall(perf_counter_sysfs_init);
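
The two class attributes registered above expose perf_reserved_percpu and perf_overcommit at run time; with the usual sysdev layout they should surface as /sys/devices/system/cpu/perf_counters/reserve_percpu and .../overcommit (the exact path is an assumption - the patch only names the "perf_counters" group). A small sketch of driving them from user space:

    /* hypothetical sketch - the sysfs path is assumed, not spelled out here */
    #include <stdio.h>

    static int write_knob(const char *name, const char *val)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/perf_counters/%s", name);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            /* reserve two counters per CPU for per-CPU counters; the value
             * is validated against perf_max_counters in perf_set_reserve_percpu() */
            write_knob("reserve_percpu", "2");
            /* overcommit accepts only 0 or 1 (see perf_set_overcommit()) */
            write_knob("overcommit", "1");
            return 0;
    }
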
diff --git a/kernel/sched.c b/kernel/sched.c
index 748ff924a290..3dfbff5fb1ac 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -658,7 +658,7 @@ static inline int cpu_of(struct rq *rq)
658#define task_rq(p) cpu_rq(task_cpu(p)) 658#define task_rq(p) cpu_rq(task_cpu(p))
659#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 659#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
660 660
661static inline void update_rq_clock(struct rq *rq) 661inline void update_rq_clock(struct rq *rq)
662{ 662{
663 rq->clock = sched_clock_cpu(cpu_of(rq)); 663 rq->clock = sched_clock_cpu(cpu_of(rq));
664} 664}
@@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 } 969 }
970} 970}
971 971
972void curr_rq_lock_irq_save(unsigned long *flags)
973 __acquires(rq->lock)
974{
975 struct rq *rq;
976
977 local_irq_save(*flags);
978 rq = cpu_rq(smp_processor_id());
979 spin_lock(&rq->lock);
980}
981
982void curr_rq_unlock_irq_restore(unsigned long *flags)
983 __releases(rq->lock)
984{
985 struct rq *rq;
986
987 rq = cpu_rq(smp_processor_id());
988 spin_unlock(&rq->lock);
989 local_irq_restore(*flags);
990}
991
972void task_rq_unlock_wait(struct task_struct *p) 992void task_rq_unlock_wait(struct task_struct *p)
973{ 993{
974 struct rq *rq = task_rq(p); 994 struct rq *rq = task_rq(p);
@@ -1876,12 +1896,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1876 p->se.sleep_start -= clock_offset; 1896 p->se.sleep_start -= clock_offset;
1877 if (p->se.block_start) 1897 if (p->se.block_start)
1878 p->se.block_start -= clock_offset; 1898 p->se.block_start -= clock_offset;
1899#endif
1879 if (old_cpu != new_cpu) { 1900 if (old_cpu != new_cpu) {
1880 schedstat_inc(p, se.nr_migrations); 1901 p->se.nr_migrations++;
1902#ifdef CONFIG_SCHEDSTATS
1881 if (task_hot(p, old_rq->clock, NULL)) 1903 if (task_hot(p, old_rq->clock, NULL))
1882 schedstat_inc(p, se.nr_forced2_migrations); 1904 schedstat_inc(p, se.nr_forced2_migrations);
1883 }
1884#endif 1905#endif
1906 }
1885 p->se.vruntime -= old_cfsrq->min_vruntime - 1907 p->se.vruntime -= old_cfsrq->min_vruntime -
1886 new_cfsrq->min_vruntime; 1908 new_cfsrq->min_vruntime;
1887 1909
@@ -2236,6 +2258,27 @@ static int sched_balance_self(int cpu, int flag)
2236 2258
2237#endif /* CONFIG_SMP */ 2259#endif /* CONFIG_SMP */
2238 2260
2261/**
2262 * task_oncpu_function_call - call a function on the cpu on which a task runs
2263 * @p: the task to evaluate
2264 * @func: the function to be called
2265 * @info: the function call argument
2266 *
2267 * Calls the function @func only while the task is currently running. This
2268 * might be on the current CPU, in which case the function is called directly.
2269 */
2270void task_oncpu_function_call(struct task_struct *p,
2271 void (*func) (void *info), void *info)
2272{
2273 int cpu;
2274
2275 preempt_disable();
2276 cpu = task_cpu(p);
2277 if (task_curr(p))
2278 smp_call_function_single(cpu, func, info, 1);
2279 preempt_enable();
2280}
2281
2239/*** 2282/***
2240 * try_to_wake_up - wake up a thread 2283 * try_to_wake_up - wake up a thread
2241 * @p: the to-be-woken-up thread 2284 * @p: the to-be-woken-up thread
@@ -2378,6 +2421,7 @@ static void __sched_fork(struct task_struct *p)
2378 p->se.exec_start = 0; 2421 p->se.exec_start = 0;
2379 p->se.sum_exec_runtime = 0; 2422 p->se.sum_exec_runtime = 0;
2380 p->se.prev_sum_exec_runtime = 0; 2423 p->se.prev_sum_exec_runtime = 0;
2424 p->se.nr_migrations = 0;
2381 p->se.last_wakeup = 0; 2425 p->se.last_wakeup = 0;
2382 p->se.avg_overlap = 0; 2426 p->se.avg_overlap = 0;
2383 2427
@@ -2598,6 +2642,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2598 */ 2642 */
2599 prev_state = prev->state; 2643 prev_state = prev->state;
2600 finish_arch_switch(prev); 2644 finish_arch_switch(prev);
2645 perf_counter_task_sched_in(current, cpu_of(rq));
2601 finish_lock_switch(rq, prev); 2646 finish_lock_switch(rq, prev);
2602#ifdef CONFIG_SMP 2647#ifdef CONFIG_SMP
2603 if (current->sched_class->post_schedule) 2648 if (current->sched_class->post_schedule)
@@ -4056,6 +4101,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
4056 * Return any ns on the sched_clock that have not yet been banked in 4101 * Return any ns on the sched_clock that have not yet been banked in
4057 * @p in case that task is currently running. 4102 * @p in case that task is currently running.
4058 */ 4103 */
4104unsigned long long __task_delta_exec(struct task_struct *p, int update)
4105{
4106 s64 delta_exec;
4107 struct rq *rq;
4108
4109 rq = task_rq(p);
4110 WARN_ON_ONCE(!runqueue_is_locked());
4111 WARN_ON_ONCE(!task_current(rq, p));
4112
4113 if (update)
4114 update_rq_clock(rq);
4115
4116 delta_exec = rq->clock - p->se.exec_start;
4117
4118 WARN_ON_ONCE(delta_exec < 0);
4119
4120 return delta_exec;
4121}
4122
4123/*
4124 * Return any ns on the sched_clock that have not yet been banked in
4125 * @p in case that task is currently running.
4126 */
4059unsigned long long task_delta_exec(struct task_struct *p) 4127unsigned long long task_delta_exec(struct task_struct *p)
4060{ 4128{
4061 unsigned long flags; 4129 unsigned long flags;
@@ -4279,6 +4347,7 @@ void scheduler_tick(void)
4279 update_rq_clock(rq); 4347 update_rq_clock(rq);
4280 update_cpu_load(rq); 4348 update_cpu_load(rq);
4281 curr->sched_class->task_tick(rq, curr, 0); 4349 curr->sched_class->task_tick(rq, curr, 0);
4350 perf_counter_task_tick(curr, cpu);
4282 spin_unlock(&rq->lock); 4351 spin_unlock(&rq->lock);
4283 4352
4284#ifdef CONFIG_SMP 4353#ifdef CONFIG_SMP
@@ -4474,6 +4543,7 @@ need_resched_nonpreemptible:
4474 4543
4475 if (likely(prev != next)) { 4544 if (likely(prev != next)) {
4476 sched_info_switch(prev, next); 4545 sched_info_switch(prev, next);
4546 perf_counter_task_sched_out(prev, cpu);
4477 4547
4478 rq->nr_switches++; 4548 rq->nr_switches++;
4479 rq->curr = next; 4549 rq->curr = next;
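
The scheduler changes fall into three groups: context-switch and tick hooks (perf_counter_task_sched_in/_sched_out and perf_counter_task_tick), runqueue-lock helpers for the current CPU (curr_rq_lock_irq_save()/curr_rq_unlock_irq_restore(), used by the exit path earlier in this patch), and task_oncpu_function_call(), which runs a function on whatever CPU a task is currently executing on - the mechanism that lets the counter code reach a task running remotely. A hypothetical kernel-side sketch of the latter, assuming the declaration this patch adds to <linux/sched.h>:

    /* hypothetical kernel-side sketch - not part of this patch */
    #include <linux/kernel.h>
    #include <linux/sched.h>
    #include <linux/smp.h>

    static void report_cpu(void *info)
    {
            struct task_struct *p = info;

            /* runs on the CPU on which @p is currently executing */
            printk(KERN_INFO "task %s observed on CPU %d\n",
                   p->comm, smp_processor_id());
    }

    static void observe_task(struct task_struct *p)
    {
            /* no-op if @p is not running anywhere at the moment */
            task_oncpu_function_call(p, report_cpu, p);
    }
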
diff --git a/kernel/sys.c b/kernel/sys.c
index ebe65c2c9873..1544c305751e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1793 case PR_SET_TSC: 1794 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1795 error = SET_TSC_CTL(arg2);
1795 break; 1796 break;
1797 case PR_TASK_PERF_COUNTERS_DISABLE:
1798 error = perf_counter_task_disable();
1799 break;
1800 case PR_TASK_PERF_COUNTERS_ENABLE:
1801 error = perf_counter_task_enable();
1802 break;
1796 case PR_GET_TIMERSLACK: 1803 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1804 error = current->timer_slack_ns;
1798 break; 1805 break;
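
The two new prctl() options let a task cheaply disable and re-enable all of its own counters without touching any file descriptors, for example to exclude a setup phase from measurement. A hypothetical user-space sketch (the PR_TASK_PERF_COUNTERS_* constants come from the <linux/prctl.h> change in this patch):

    /* hypothetical sketch - not part of this patch */
    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>    /* PR_TASK_PERF_COUNTERS_* added by this patch */

    static void setup_phase(void)
    {
            puts("work we do not want counted");
    }

    int main(void)
    {
            /* pause all counters attached to this task ... */
            prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
            setup_phase();
            /* ... and resume counting afterwards */
            prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
            return 0;
    }
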
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a23281707..4be8bbc7577c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
174cond_syscall(compat_sys_timerfd_gettime); 174cond_syscall(compat_sys_timerfd_gettime);
175cond_syscall(sys_eventfd); 175cond_syscall(sys_eventfd);
176cond_syscall(sys_eventfd2); 176cond_syscall(sys_eventfd2);
177
178/* performance counters: */
179cond_syscall(sys_perf_counter_open);
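
Finally, because sys_perf_counter_open is wired up via cond_syscall(), a kernel built without the new perf counters Kconfig option resolves it to sys_ni_syscall, so user space sees -ENOSYS rather than a missing symbol. A hypothetical run-time probe (same __NR_perf_counter_open assumption as before):

    /* hypothetical run-time probe - not part of this patch */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    int main(void)
    {
            struct perf_counter_hw_event ev;
            int fd;

            memset(&ev, 0, sizeof(ev));
            fd = syscall(__NR_perf_counter_open, &ev, 0, -1, -1);
            if (fd < 0 && errno == ENOSYS) {
                    puts("kernel built without perf counter support");
                    return 1;
            }
            if (fd >= 0)
                    close(fd);
            puts("perf counters available");
            return 0;
    }
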