aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2008-12-04 14:12:29 -0500
committerIngo Molnar <mingo@elte.hu>2008-12-08 09:47:03 -0500
commit0793a61d4df8daeac6492dbf8d2f3e5713caae5e (patch)
treecc9603eb8daffeb7ace521c42a6a44db164ac551
parentb5aa97e83bcc31a96374d18f5452d53909a16c90 (diff)
performance counters: core code
Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--drivers/char/sysrq.c2
-rw-r--r--include/linux/perf_counter.h171
-rw-r--r--include/linux/sched.h9
-rw-r--r--include/linux/syscalls.h6
-rw-r--r--init/Kconfig29
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/perf_counter.c943
-rw-r--r--kernel/sched.c24
-rw-r--r--kernel/sys_ni.c3
10 files changed, 1189 insertions, 0 deletions
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ce0d9da52a8..52146c2a8d9 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
244 struct pt_regs *regs = get_irq_regs(); 245 struct pt_regs *regs = get_irq_regs();
245 if (regs) 246 if (regs)
246 show_regs(regs); 247 show_regs(regs);
248 perf_counter_print_debug();
247} 249}
248static struct sysrq_key_op sysrq_showregs_op = { 250static struct sysrq_key_op sysrq_showregs_op = {
249 .handler = sysrq_handle_showregs, 251 .handler = sysrq_handle_showregs,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 00000000000..22c4469abf4
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,171 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <asm/atomic.h>
17
18#include <linux/list.h>
19#include <linux/mutex.h>
20#include <linux/rculist.h>
21#include <linux/rcupdate.h>
22#include <linux/spinlock.h>
23
24struct task_struct;
25
/*
 * Generalized hardware event types. One of these is passed as the
 * hw_event_type parameter of the sys_perf_counter_open() syscall:
 */
enum hw_event_types {
	PERF_COUNT_CYCLES		= 0,
	PERF_COUNT_INSTRUCTIONS		= 1,
	PERF_COUNT_CACHE_REFERENCES	= 2,
	PERF_COUNT_CACHE_MISSES		= 3,
	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_BRANCH_MISSES	= 5,

	/*
	 * Flag bit: if it is set in the type, then trigger NMI sampling.
	 */
	PERF_COUNT_NMI			= (1 << 30),
};
42
/*
 * IRQ-notification data record type. perf_read() serves
 * PERF_RECORD_SIMPLE by reading the counter value and the other two
 * types from the IRQ-fed data buffers.
 */
enum perf_record_type {
	PERF_RECORD_SIMPLE	= 0,
	PERF_RECORD_IRQ		= 1,
	PERF_RECORD_GROUP	= 2,
};
51
52/**
53 * struct hw_perf_counter - performance counter hardware details
54 */
55struct hw_perf_counter {
56 u64 config;
57 unsigned long config_base;
58 unsigned long counter_base;
59 int nmi;
60 unsigned int idx;
61 u64 prev_count;
62 s32 next_count;
63 u64 irq_period;
64};
65
66/*
67 * Hardcoded buffer length limit for now, for IRQ-fed events:
68 */
69#define PERF_DATA_BUFLEN 2048
70
71/**
72 * struct perf_data - performance counter IRQ data sampling ...
73 */
74struct perf_data {
75 int len;
76 int rd_idx;
77 int overrun;
78 u8 data[PERF_DATA_BUFLEN];
79};
80
81/**
82 * struct perf_counter - performance counter kernel representation:
83 */
84struct perf_counter {
85 struct list_head list;
86 int active;
87#if BITS_PER_LONG == 64
88 atomic64_t count;
89#else
90 atomic_t count32[2];
91#endif
92 u64 __irq_period;
93
94 struct hw_perf_counter hw;
95
96 struct perf_counter_context *ctx;
97 struct task_struct *task;
98
99 /*
100 * Protect attach/detach:
101 */
102 struct mutex mutex;
103
104 int oncpu;
105 int cpu;
106
107 s32 hw_event_type;
108 enum perf_record_type record_type;
109
110 /* read() / irq related data */
111 wait_queue_head_t waitq;
112 /* optional: for NMIs */
113 int wakeup_pending;
114 struct perf_data *irqdata;
115 struct perf_data *usrdata;
116 struct perf_data data[2];
117};
118
/**
 * struct perf_counter_context - counter context structure
 *
 * Container for a list of counters. It is embedded in task_struct for
 * per task counters and in struct perf_cpu_context for per CPU
 * counters. The structure is empty when CONFIG_PERF_COUNTERS is off.
 */
struct perf_counter_context {
#ifdef CONFIG_PERF_COUNTERS
	/*
	 * Protect the list of counters:
	 */
	spinlock_t			lock;
	struct list_head		counters;
	/* Number of counters on the list: */
	int				nr_counters;
	/* Number of those counters that are currently active: */
	int				nr_active;
	/* Owning task, NULL for a per CPU context: */
	struct task_struct		*task;
#endif
};
136
137/**
138 * struct perf_counter_cpu_context - per cpu counter context structure
139 */
140struct perf_cpu_context {
141 struct perf_counter_context ctx;
142 struct perf_counter_context *task_ctx;
143 int active_oncpu;
144 int max_pertask;
145};
146
/*
 * Set by architecture code:
 */
extern int perf_max_counters;

/*
 * Hooks called from the scheduler, fork and sysrq code. They collapse
 * into empty inline stubs when CONFIG_PERF_COUNTERS is disabled, so the
 * call sites need no #ifdefs.
 */
#ifdef CONFIG_PERF_COUNTERS

extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
extern void perf_counter_init_task(struct task_struct *task);
extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);

#else /* !CONFIG_PERF_COUNTERS */

static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
}

static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
}

static inline void perf_counter_task_tick(struct task_struct *task, int cpu)
{
}

static inline void perf_counter_init_task(struct task_struct *task)
{
}

static inline void perf_counter_notify(struct pt_regs *regs)
{
}

static inline void perf_counter_print_debug(void)
{
}

#endif /* CONFIG_PERF_COUNTERS */
170
171#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d11447..4c530278391 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/fs_struct.h> 71#include <linux/fs_struct.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -1326,6 +1327,7 @@ struct task_struct {
1326 struct list_head pi_state_list; 1327 struct list_head pi_state_list;
1327 struct futex_pi_state *pi_state_cache; 1328 struct futex_pi_state *pi_state_cache;
1328#endif 1329#endif
1330 struct perf_counter_context perf_counter_ctx;
1329#ifdef CONFIG_NUMA 1331#ifdef CONFIG_NUMA
1330 struct mempolicy *mempolicy; 1332 struct mempolicy *mempolicy;
1331 short il_next; 1333 short il_next;
@@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2285#define TASK_SIZE_OF(tsk) TASK_SIZE 2287#define TASK_SIZE_OF(tsk) TASK_SIZE
2286#endif 2288#endif
2287 2289
2290/*
2291 * Call the function if the target task is executing on a CPU right now:
2292 */
2293extern void task_oncpu_function_call(struct task_struct *p,
2294 void (*func) (void *info), void *info);
2295
2296
2288#ifdef CONFIG_MM_OWNER 2297#ifdef CONFIG_MM_OWNER
2289extern void mm_update_next_owner(struct mm_struct *mm); 2298extern void mm_update_next_owner(struct mm_struct *mm);
2290extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2299extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 04fb47bfb92..6cce728a626 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
624 624
625int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 625int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
626 626
627asmlinkage int
628sys_perf_counter_open(u32 hw_event_type,
629 u32 hw_event_period,
630 u32 record_type,
631 pid_t pid,
632 int cpu);
627#endif 633#endif
diff --git a/init/Kconfig b/init/Kconfig
index f763762d544..78bede218f1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -732,6 +732,35 @@ config AIO
732 by some high performance threaded applications. Disabling 732 by some high performance threaded applications. Disabling
733 this option saves about 7k. 733 this option saves about 7k.
734 734
735config HAVE_PERF_COUNTERS
736 bool
737
738menu "Performance Counters"
739
740config PERF_COUNTERS
741 bool "Kernel Performance Counters"
742 depends on HAVE_PERF_COUNTERS
743 default y
744 help
745 Enable kernel support for performance counter hardware.
746
747 Performance counters are special hardware registers available
748 on most modern CPUs. These registers count the number of certain
749 types of hw events: such as instructions executed, cache misses
750 suffered, or branches mis-predicted - without slowing down the
751 kernel or applications. These registers can also trigger interrupts
752 when a threshold number of events have passed - and can thus be
753 used to profile the code that runs on that CPU.
754
755 The Linux Performance Counter subsystem provides an abstraction of
756 these hardware capabilities, available via a system call. It
757 provides per task and per CPU counters, and it provides event
758 capabilities on top of those.
759
760 Say Y if unsure.
761
762endmenu
763
735config VM_EVENT_COUNTERS 764config VM_EVENT_COUNTERS
736 default y 765 default y
737 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 766 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19..1f184a1dc40 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
89obj-$(CONFIG_FUNCTION_TRACER) += trace/ 89obj-$(CONFIG_FUNCTION_TRACER) += trace/
90obj-$(CONFIG_TRACING) += trace/ 90obj-$(CONFIG_TRACING) += trace/
91obj-$(CONFIG_SMP) += sched_cpupri.o 91obj-$(CONFIG_SMP) += sched_cpupri.o
92obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
92 93
93ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 94ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 95# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a372a0e206..441fadff1fa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
975 goto fork_out; 975 goto fork_out;
976 976
977 rt_mutex_init_task(p); 977 rt_mutex_init_task(p);
978 perf_counter_init_task(p);
978 979
979#ifdef CONFIG_PROVE_LOCKING 980#ifdef CONFIG_PROVE_LOCKING
980 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 981 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 00000000000..20508f05365
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,943 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/poll.h>
14#include <linux/sysfs.h>
15#include <linux/ptrace.h>
16#include <linux/percpu.h>
17#include <linux/uaccess.h>
18#include <linux/syscalls.h>
19#include <linux/anon_inodes.h>
20#include <linux/perf_counter.h>
21
22/*
23 * Each CPU has a list of per CPU counters:
24 */
25DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
26
27int perf_max_counters __read_mostly;
28static int perf_reserved_percpu __read_mostly;
29static int perf_overcommit __read_mostly = 1;
30
31/*
32 * Mutex for (sysadmin-configurable) counter reservations:
33 */
34static DEFINE_MUTEX(perf_resource_mutex);
35
36/*
37 * Architecture provided APIs - weak aliases:
38 */
39
40int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
41{
42 return -EINVAL;
43}
44
45void __weak hw_perf_counter_enable(struct perf_counter *counter) { }
46void __weak hw_perf_counter_disable(struct perf_counter *counter) { }
47void __weak hw_perf_counter_read(struct perf_counter *counter) { }
48void __weak hw_perf_disable_all(void) { }
49void __weak hw_perf_enable_all(void) { }
50void __weak hw_perf_counter_setup(void) { }
51
52#if BITS_PER_LONG == 64
53
54/*
55 * Read the cached counter in counter safe against cross CPU / NMI
56 * modifications. 64 bit version - no complications.
57 */
58static inline u64 perf_read_counter_safe(struct perf_counter *counter)
59{
60 return (u64) atomic64_read(&counter->count);
61}
62
63#else
64
65/*
66 * Read the cached counter in counter safe against cross CPU / NMI
67 * modifications. 32 bit version.
68 */
69static u64 perf_read_counter_safe(struct perf_counter *counter)
70{
71 u32 cntl, cnth;
72
73 local_irq_disable();
74 do {
75 cnth = atomic_read(&counter->count32[1]);
76 cntl = atomic_read(&counter->count32[0]);
77 } while (cnth != atomic_read(&counter->count32[1]));
78
79 local_irq_enable();
80
81 return cntl | ((u64) cnth) << 32;
82}
83
84#endif
85
86/*
87 * Cross CPU call to remove a performance counter
88 *
89 * We disable the counter on the hardware level first. After that we
90 * remove it from the context list.
91 */
92static void __perf_remove_from_context(void *info)
93{
94 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
95 struct perf_counter *counter = info;
96 struct perf_counter_context *ctx = counter->ctx;
97
98 /*
99 * If this is a task context, we need to check whether it is
100 * the current task context of this cpu. If not it has been
101 * scheduled out before the smp call arrived.
102 */
103 if (ctx->task && cpuctx->task_ctx != ctx)
104 return;
105
106 spin_lock(&ctx->lock);
107
108 if (counter->active) {
109 hw_perf_counter_disable(counter);
110 counter->active = 0;
111 ctx->nr_active--;
112 cpuctx->active_oncpu--;
113 counter->task = NULL;
114 }
115 ctx->nr_counters--;
116
117 /*
118 * Protect the list operation against NMI by disabling the
119 * counters on a global level. NOP for non NMI based counters.
120 */
121 hw_perf_disable_all();
122 list_del_init(&counter->list);
123 hw_perf_enable_all();
124
125 if (!ctx->task) {
126 /*
127 * Allow more per task counters with respect to the
128 * reservation:
129 */
130 cpuctx->max_pertask =
131 min(perf_max_counters - ctx->nr_counters,
132 perf_max_counters - perf_reserved_percpu);
133 }
134
135 spin_unlock(&ctx->lock);
136}
137
138
139/*
140 * Remove the counter from a task's (or a CPU's) list of counters.
141 *
142 * Must be called with counter->mutex held.
143 *
144 * CPU counters are removed with a smp call. For task counters we only
145 * call when the task is on a CPU.
146 */
147static void perf_remove_from_context(struct perf_counter *counter)
148{
149 struct perf_counter_context *ctx = counter->ctx;
150 struct task_struct *task = ctx->task;
151
152 if (!task) {
153 /*
154 * Per cpu counters are removed via an smp call and
155 * the removal is always sucessful.
156 */
157 smp_call_function_single(counter->cpu,
158 __perf_remove_from_context,
159 counter, 1);
160 return;
161 }
162
163retry:
164 task_oncpu_function_call(task, __perf_remove_from_context,
165 counter);
166
167 spin_lock_irq(&ctx->lock);
168 /*
169 * If the context is active we need to retry the smp call.
170 */
171 if (ctx->nr_active && !list_empty(&counter->list)) {
172 spin_unlock_irq(&ctx->lock);
173 goto retry;
174 }
175
176 /*
177 * The lock prevents that this context is scheduled in so we
178 * can remove the counter safely, if it the call above did not
179 * succeed.
180 */
181 if (!list_empty(&counter->list)) {
182 ctx->nr_counters--;
183 list_del_init(&counter->list);
184 counter->task = NULL;
185 }
186 spin_unlock_irq(&ctx->lock);
187}
188
189/*
190 * Cross CPU call to install and enable a preformance counter
191 */
192static void __perf_install_in_context(void *info)
193{
194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
195 struct perf_counter *counter = info;
196 struct perf_counter_context *ctx = counter->ctx;
197 int cpu = smp_processor_id();
198
199 /*
200 * If this is a task context, we need to check whether it is
201 * the current task context of this cpu. If not it has been
202 * scheduled out before the smp call arrived.
203 */
204 if (ctx->task && cpuctx->task_ctx != ctx)
205 return;
206
207 spin_lock(&ctx->lock);
208
209 /*
210 * Protect the list operation against NMI by disabling the
211 * counters on a global level. NOP for non NMI based counters.
212 */
213 hw_perf_disable_all();
214 list_add_tail(&counter->list, &ctx->counters);
215 hw_perf_enable_all();
216
217 ctx->nr_counters++;
218
219 if (cpuctx->active_oncpu < perf_max_counters) {
220 hw_perf_counter_enable(counter);
221 counter->active = 1;
222 counter->oncpu = cpu;
223 ctx->nr_active++;
224 cpuctx->active_oncpu++;
225 }
226
227 if (!ctx->task && cpuctx->max_pertask)
228 cpuctx->max_pertask--;
229
230 spin_unlock(&ctx->lock);
231}
232
233/*
234 * Attach a performance counter to a context
235 *
236 * First we add the counter to the list with the hardware enable bit
237 * in counter->hw_config cleared.
238 *
239 * If the counter is attached to a task which is on a CPU we use a smp
240 * call to enable it in the task context. The task might have been
241 * scheduled away, but we check this in the smp call again.
242 */
243static void
244perf_install_in_context(struct perf_counter_context *ctx,
245 struct perf_counter *counter,
246 int cpu)
247{
248 struct task_struct *task = ctx->task;
249
250 counter->ctx = ctx;
251 if (!task) {
252 /*
253 * Per cpu counters are installed via an smp call and
254 * the install is always sucessful.
255 */
256 smp_call_function_single(cpu, __perf_install_in_context,
257 counter, 1);
258 return;
259 }
260
261 counter->task = task;
262retry:
263 task_oncpu_function_call(task, __perf_install_in_context,
264 counter);
265
266 spin_lock_irq(&ctx->lock);
267 /*
268 * If the context is active and the counter has not been added
269 * we need to retry the smp call.
270 */
271 if (ctx->nr_active && list_empty(&counter->list)) {
272 spin_unlock_irq(&ctx->lock);
273 goto retry;
274 }
275
276 /*
277 * The lock prevents that this context is scheduled in so we
278 * can add the counter safely, if it the call above did not
279 * succeed.
280 */
281 if (list_empty(&counter->list)) {
282 list_add_tail(&counter->list, &ctx->counters);
283 ctx->nr_counters++;
284 }
285 spin_unlock_irq(&ctx->lock);
286}
287
288/*
289 * Called from scheduler to remove the counters of the current task,
290 * with interrupts disabled.
291 *
292 * We stop each counter and update the counter value in counter->count.
293 *
294 * This does not protect us against NMI, but hw_perf_counter_disable()
295 * sets the disabled bit in the control field of counter _before_
296 * accessing the counter control register. If a NMI hits, then it will
297 * not restart the counter.
298 */
299void perf_counter_task_sched_out(struct task_struct *task, int cpu)
300{
301 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
302 struct perf_counter_context *ctx = &task->perf_counter_ctx;
303 struct perf_counter *counter;
304
305 if (likely(!cpuctx->task_ctx))
306 return;
307
308 spin_lock(&ctx->lock);
309 list_for_each_entry(counter, &ctx->counters, list) {
310 if (!ctx->nr_active)
311 break;
312 if (counter->active) {
313 hw_perf_counter_disable(counter);
314 counter->active = 0;
315 counter->oncpu = -1;
316 ctx->nr_active--;
317 cpuctx->active_oncpu--;
318 }
319 }
320 spin_unlock(&ctx->lock);
321 cpuctx->task_ctx = NULL;
322}
323
324/*
325 * Called from scheduler to add the counters of the current task
326 * with interrupts disabled.
327 *
328 * We restore the counter value and then enable it.
329 *
330 * This does not protect us against NMI, but hw_perf_counter_enable()
331 * sets the enabled bit in the control field of counter _before_
332 * accessing the counter control register. If a NMI hits, then it will
333 * keep the counter running.
334 */
335void perf_counter_task_sched_in(struct task_struct *task, int cpu)
336{
337 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
338 struct perf_counter_context *ctx = &task->perf_counter_ctx;
339 struct perf_counter *counter;
340
341 if (likely(!ctx->nr_counters))
342 return;
343
344 spin_lock(&ctx->lock);
345 list_for_each_entry(counter, &ctx->counters, list) {
346 if (ctx->nr_active == cpuctx->max_pertask)
347 break;
348 if (counter->cpu != -1 && counter->cpu != cpu)
349 continue;
350
351 hw_perf_counter_enable(counter);
352 counter->active = 1;
353 counter->oncpu = cpu;
354 ctx->nr_active++;
355 cpuctx->active_oncpu++;
356 }
357 spin_unlock(&ctx->lock);
358 cpuctx->task_ctx = ctx;
359}
360
361void perf_counter_task_tick(struct task_struct *curr, int cpu)
362{
363 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
364 struct perf_counter *counter;
365
366 if (likely(!ctx->nr_counters))
367 return;
368
369 perf_counter_task_sched_out(curr, cpu);
370
371 spin_lock(&ctx->lock);
372
373 /*
374 * Rotate the first entry last:
375 */
376 hw_perf_disable_all();
377 list_for_each_entry(counter, &ctx->counters, list) {
378 list_del(&counter->list);
379 list_add_tail(&counter->list, &ctx->counters);
380 break;
381 }
382 hw_perf_enable_all();
383
384 spin_unlock(&ctx->lock);
385
386 perf_counter_task_sched_in(curr, cpu);
387}
388
389/*
390 * Initialize the perf_counter context in task_struct
391 */
392void perf_counter_init_task(struct task_struct *task)
393{
394 struct perf_counter_context *ctx = &task->perf_counter_ctx;
395
396 spin_lock_init(&ctx->lock);
397 INIT_LIST_HEAD(&ctx->counters);
398 ctx->nr_counters = 0;
399 ctx->task = task;
400}
401
/*
 * Cross CPU call to read the hardware counter. Adapts the void *
 * argument of the smp call to hw_perf_counter_read().
 */
static void __hw_perf_counter_read(void *info)
{
	struct perf_counter *counter = info;

	hw_perf_counter_read(counter);
}
409
410static u64 perf_read_counter(struct perf_counter *counter)
411{
412 /*
413 * If counter is enabled and currently active on a CPU, update the
414 * value in the counter structure:
415 */
416 if (counter->active) {
417 smp_call_function_single(counter->oncpu,
418 __hw_perf_counter_read, counter, 1);
419 }
420
421 return perf_read_counter_safe(counter);
422}
423
424/*
425 * Cross CPU call to switch performance data pointers
426 */
427static void __perf_switch_irq_data(void *info)
428{
429 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
430 struct perf_counter *counter = info;
431 struct perf_counter_context *ctx = counter->ctx;
432 struct perf_data *oldirqdata = counter->irqdata;
433
434 /*
435 * If this is a task context, we need to check whether it is
436 * the current task context of this cpu. If not it has been
437 * scheduled out before the smp call arrived.
438 */
439 if (ctx->task) {
440 if (cpuctx->task_ctx != ctx)
441 return;
442 spin_lock(&ctx->lock);
443 }
444
445 /* Change the pointer NMI safe */
446 atomic_long_set((atomic_long_t *)&counter->irqdata,
447 (unsigned long) counter->usrdata);
448 counter->usrdata = oldirqdata;
449
450 if (ctx->task)
451 spin_unlock(&ctx->lock);
452}
453
454static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
455{
456 struct perf_counter_context *ctx = counter->ctx;
457 struct perf_data *oldirqdata = counter->irqdata;
458 struct task_struct *task = ctx->task;
459
460 if (!task) {
461 smp_call_function_single(counter->cpu,
462 __perf_switch_irq_data,
463 counter, 1);
464 return counter->usrdata;
465 }
466
467retry:
468 spin_lock_irq(&ctx->lock);
469 if (!counter->active) {
470 counter->irqdata = counter->usrdata;
471 counter->usrdata = oldirqdata;
472 spin_unlock_irq(&ctx->lock);
473 return oldirqdata;
474 }
475 spin_unlock_irq(&ctx->lock);
476 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
477 /* Might have failed, because task was scheduled out */
478 if (counter->irqdata == oldirqdata)
479 goto retry;
480
481 return counter->usrdata;
482}
483
484static void put_context(struct perf_counter_context *ctx)
485{
486 if (ctx->task)
487 put_task_struct(ctx->task);
488}
489
490static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
491{
492 struct perf_cpu_context *cpuctx;
493 struct perf_counter_context *ctx;
494 struct task_struct *task;
495
496 /*
497 * If cpu is not a wildcard then this is a percpu counter:
498 */
499 if (cpu != -1) {
500 /* Must be root to operate on a CPU counter: */
501 if (!capable(CAP_SYS_ADMIN))
502 return ERR_PTR(-EACCES);
503
504 if (cpu < 0 || cpu > num_possible_cpus())
505 return ERR_PTR(-EINVAL);
506
507 /*
508 * We could be clever and allow to attach a counter to an
509 * offline CPU and activate it when the CPU comes up, but
510 * that's for later.
511 */
512 if (!cpu_isset(cpu, cpu_online_map))
513 return ERR_PTR(-ENODEV);
514
515 cpuctx = &per_cpu(perf_cpu_context, cpu);
516 ctx = &cpuctx->ctx;
517
518 WARN_ON_ONCE(ctx->task);
519 return ctx;
520 }
521
522 rcu_read_lock();
523 if (!pid)
524 task = current;
525 else
526 task = find_task_by_vpid(pid);
527 if (task)
528 get_task_struct(task);
529 rcu_read_unlock();
530
531 if (!task)
532 return ERR_PTR(-ESRCH);
533
534 ctx = &task->perf_counter_ctx;
535 ctx->task = task;
536
537 /* Reuse ptrace permission checks for now. */
538 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
539 put_context(ctx);
540 return ERR_PTR(-EACCES);
541 }
542
543 return ctx;
544}
545
546/*
547 * Called when the last reference to the file is gone.
548 */
549static int perf_release(struct inode *inode, struct file *file)
550{
551 struct perf_counter *counter = file->private_data;
552 struct perf_counter_context *ctx = counter->ctx;
553
554 file->private_data = NULL;
555
556 mutex_lock(&counter->mutex);
557
558 perf_remove_from_context(counter);
559 put_context(ctx);
560
561 mutex_unlock(&counter->mutex);
562
563 kfree(counter);
564
565 return 0;
566}
567
568/*
569 * Read the performance counter - simple non blocking version for now
570 */
571static ssize_t
572perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
573{
574 u64 cntval;
575
576 if (count != sizeof(cntval))
577 return -EINVAL;
578
579 mutex_lock(&counter->mutex);
580 cntval = perf_read_counter(counter);
581 mutex_unlock(&counter->mutex);
582
583 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
584}
585
586static ssize_t
587perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
588{
589 if (!usrdata->len)
590 return 0;
591
592 count = min(count, (size_t)usrdata->len);
593 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
594 return -EFAULT;
595
596 /* Adjust the counters */
597 usrdata->len -= count;
598 if (!usrdata->len)
599 usrdata->rd_idx = 0;
600 else
601 usrdata->rd_idx += count;
602
603 return count;
604}
605
606static ssize_t
607perf_read_irq_data(struct perf_counter *counter,
608 char __user *buf,
609 size_t count,
610 int nonblocking)
611{
612 struct perf_data *irqdata, *usrdata;
613 DECLARE_WAITQUEUE(wait, current);
614 ssize_t res;
615
616 irqdata = counter->irqdata;
617 usrdata = counter->usrdata;
618
619 if (usrdata->len + irqdata->len >= count)
620 goto read_pending;
621
622 if (nonblocking)
623 return -EAGAIN;
624
625 spin_lock_irq(&counter->waitq.lock);
626 __add_wait_queue(&counter->waitq, &wait);
627 for (;;) {
628 set_current_state(TASK_INTERRUPTIBLE);
629 if (usrdata->len + irqdata->len >= count)
630 break;
631
632 if (signal_pending(current))
633 break;
634
635 spin_unlock_irq(&counter->waitq.lock);
636 schedule();
637 spin_lock_irq(&counter->waitq.lock);
638 }
639 __remove_wait_queue(&counter->waitq, &wait);
640 __set_current_state(TASK_RUNNING);
641 spin_unlock_irq(&counter->waitq.lock);
642
643 if (usrdata->len + irqdata->len < count)
644 return -ERESTARTSYS;
645read_pending:
646 mutex_lock(&counter->mutex);
647
648 /* Drain pending data first: */
649 res = perf_copy_usrdata(usrdata, buf, count);
650 if (res < 0 || res == count)
651 goto out;
652
653 /* Switch irq buffer: */
654 usrdata = perf_switch_irq_data(counter);
655 if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
656 if (!res)
657 res = -EFAULT;
658 } else {
659 res = count;
660 }
661out:
662 mutex_unlock(&counter->mutex);
663
664 return res;
665}
666
667static ssize_t
668perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
669{
670 struct perf_counter *counter = file->private_data;
671
672 switch (counter->record_type) {
673 case PERF_RECORD_SIMPLE:
674 return perf_read_hw(counter, buf, count);
675
676 case PERF_RECORD_IRQ:
677 case PERF_RECORD_GROUP:
678 return perf_read_irq_data(counter, buf, count,
679 file->f_flags & O_NONBLOCK);
680 }
681 return -EINVAL;
682}
683
684static unsigned int perf_poll(struct file *file, poll_table *wait)
685{
686 struct perf_counter *counter = file->private_data;
687 unsigned int events = 0;
688 unsigned long flags;
689
690 poll_wait(file, &counter->waitq, wait);
691
692 spin_lock_irqsave(&counter->waitq.lock, flags);
693 if (counter->usrdata->len || counter->irqdata->len)
694 events |= POLLIN;
695 spin_unlock_irqrestore(&counter->waitq.lock, flags);
696
697 return events;
698}
699
700static const struct file_operations perf_fops = {
701 .release = perf_release,
702 .read = perf_read,
703 .poll = perf_poll,
704};
705
706/*
707 * Allocate and initialize a counter structure
708 */
709static struct perf_counter *
710perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
711{
712 struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
713
714 if (!counter)
715 return NULL;
716
717 mutex_init(&counter->mutex);
718 INIT_LIST_HEAD(&counter->list);
719 init_waitqueue_head(&counter->waitq);
720
721 counter->irqdata = &counter->data[0];
722 counter->usrdata = &counter->data[1];
723 counter->cpu = cpu;
724 counter->record_type = record_type;
725 counter->__irq_period = hw_event_period;
726 counter->wakeup_pending = 0;
727
728 return counter;
729}
730
731/**
732 * sys_perf_task_open - open a performance counter associate it to a task
733 * @hw_event_type: event type for monitoring/sampling...
734 * @pid: target pid
735 */
736asmlinkage int
737sys_perf_counter_open(u32 hw_event_type,
738 u32 hw_event_period,
739 u32 record_type,
740 pid_t pid,
741 int cpu)
742{
743 struct perf_counter_context *ctx;
744 struct perf_counter *counter;
745 int ret;
746
747 ctx = find_get_context(pid, cpu);
748 if (IS_ERR(ctx))
749 return PTR_ERR(ctx);
750
751 ret = -ENOMEM;
752 counter = perf_counter_alloc(hw_event_period, cpu, record_type);
753 if (!counter)
754 goto err_put_context;
755
756 ret = hw_perf_counter_init(counter, hw_event_type);
757 if (ret)
758 goto err_free_put_context;
759
760 perf_install_in_context(ctx, counter, cpu);
761
762 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
763 if (ret < 0)
764 goto err_remove_free_put_context;
765
766 return ret;
767
768err_remove_free_put_context:
769 mutex_lock(&counter->mutex);
770 perf_remove_from_context(counter);
771 mutex_unlock(&counter->mutex);
772
773err_free_put_context:
774 kfree(counter);
775
776err_put_context:
777 put_context(ctx);
778
779 return ret;
780}
781
782static void __cpuinit perf_init_cpu(int cpu)
783{
784 struct perf_cpu_context *ctx;
785
786 ctx = &per_cpu(perf_cpu_context, cpu);
787 spin_lock_init(&ctx->ctx.lock);
788 INIT_LIST_HEAD(&ctx->ctx.counters);
789
790 mutex_lock(&perf_resource_mutex);
791 ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
792 mutex_unlock(&perf_resource_mutex);
793 hw_perf_counter_setup();
794}
795
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Cross-call target of perf_exit_cpu(): remove every counter from the
 * context of the cpu this function executes on. @info is unused.
 */
static void __perf_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *pos, *next;

	/* _safe variant: entries are removed while the list is walked. */
	list_for_each_entry_safe(pos, next, &cpuctx->ctx.counters, list)
		__perf_remove_from_context(pos);
}

/* Run __perf_exit_cpu() on @cpu and wait for it to finish. */
static void perf_exit_cpu(int cpu)
{
	smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
}
#else
/* Without cpu hotplug there is nothing to tear down. */
static inline void perf_exit_cpu(int cpu) { }
#endif
814
815static int __cpuinit
816perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
817{
818 unsigned int cpu = (long)hcpu;
819
820 switch (action) {
821
822 case CPU_UP_PREPARE:
823 case CPU_UP_PREPARE_FROZEN:
824 perf_init_cpu(cpu);
825 break;
826
827 case CPU_DOWN_PREPARE:
828 case CPU_DOWN_PREPARE_FROZEN:
829 perf_exit_cpu(cpu);
830 break;
831
832 default:
833 break;
834 }
835
836 return NOTIFY_OK;
837}
838
839static struct notifier_block __cpuinitdata perf_cpu_nb = {
840 .notifier_call = perf_cpu_notify,
841};
842
843static int __init perf_counter_init(void)
844{
845 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
846 (void *)(long)smp_processor_id());
847 register_cpu_notifier(&perf_cpu_nb);
848
849 return 0;
850}
851early_initcall(perf_counter_init);
852
853static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
854{
855 return sprintf(buf, "%d\n", perf_reserved_percpu);
856}
857
858static ssize_t
859perf_set_reserve_percpu(struct sysdev_class *class,
860 const char *buf,
861 size_t count)
862{
863 struct perf_cpu_context *cpuctx;
864 unsigned long val;
865 int err, cpu, mpt;
866
867 err = strict_strtoul(buf, 10, &val);
868 if (err)
869 return err;
870 if (val > perf_max_counters)
871 return -EINVAL;
872
873 mutex_lock(&perf_resource_mutex);
874 perf_reserved_percpu = val;
875 for_each_online_cpu(cpu) {
876 cpuctx = &per_cpu(perf_cpu_context, cpu);
877 spin_lock_irq(&cpuctx->ctx.lock);
878 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
879 perf_max_counters - perf_reserved_percpu);
880 cpuctx->max_pertask = mpt;
881 spin_unlock_irq(&cpuctx->ctx.lock);
882 }
883 mutex_unlock(&perf_resource_mutex);
884
885 return count;
886}
887
888static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
889{
890 return sprintf(buf, "%d\n", perf_overcommit);
891}
892
893static ssize_t
894perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
895{
896 unsigned long val;
897 int err;
898
899 err = strict_strtoul(buf, 10, &val);
900 if (err)
901 return err;
902 if (val > 1)
903 return -EINVAL;
904
905 mutex_lock(&perf_resource_mutex);
906 perf_overcommit = val;
907 mutex_unlock(&perf_resource_mutex);
908
909 return count;
910}
911
912static SYSDEV_CLASS_ATTR(
913 reserve_percpu,
914 0644,
915 perf_show_reserve_percpu,
916 perf_set_reserve_percpu
917 );
918
919static SYSDEV_CLASS_ATTR(
920 overcommit,
921 0644,
922 perf_show_overcommit,
923 perf_set_overcommit
924 );
925
926static struct attribute *perfclass_attrs[] = {
927 &attr_reserve_percpu.attr,
928 &attr_overcommit.attr,
929 NULL
930};
931
932static struct attribute_group perfclass_attr_group = {
933 .attrs = perfclass_attrs,
934 .name = "perf_counters",
935};
936
937static int __init perf_counter_sysfs_init(void)
938{
939 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
940 &perfclass_attr_group);
941}
942device_initcall(perf_counter_sysfs_init);
943
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb5c3d..254d56de254 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag)
2212 2212
2213#endif /* CONFIG_SMP */ 2213#endif /* CONFIG_SMP */
2214 2214
/**
 * task_oncpu_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls @func on the cpu of @p, but only if @p is currently running;
 * otherwise nothing is called. The target cpu may be the current one.
 * Preemption is disabled around the check and the call.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int target_cpu;

	preempt_disable();

	/* Read the cpu first, then check that the task is running. */
	target_cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(target_cpu, func, info, 1);

	preempt_enable();
}
2235
2215/*** 2236/***
2216 * try_to_wake_up - wake up a thread 2237 * try_to_wake_up - wake up a thread
2217 * @p: the to-be-woken-up thread 2238 * @p: the to-be-woken-up thread
@@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2534 struct task_struct *next) 2555 struct task_struct *next)
2535{ 2556{
2536 fire_sched_out_preempt_notifiers(prev, next); 2557 fire_sched_out_preempt_notifiers(prev, next);
2558 perf_counter_task_sched_out(prev, cpu_of(rq));
2537 prepare_lock_switch(rq, next); 2559 prepare_lock_switch(rq, next);
2538 prepare_arch_switch(next); 2560 prepare_arch_switch(next);
2539} 2561}
@@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2574 */ 2596 */
2575 prev_state = prev->state; 2597 prev_state = prev->state;
2576 finish_arch_switch(prev); 2598 finish_arch_switch(prev);
2599 perf_counter_task_sched_in(current, cpu_of(rq));
2577 finish_lock_switch(rq, prev); 2600 finish_lock_switch(rq, prev);
2578#ifdef CONFIG_SMP 2601#ifdef CONFIG_SMP
2579 if (current->sched_class->post_schedule) 2602 if (current->sched_class->post_schedule)
@@ -4296,6 +4319,7 @@ void scheduler_tick(void)
4296 rq->idle_at_tick = idle_cpu(cpu); 4319 rq->idle_at_tick = idle_cpu(cpu);
4297 trigger_load_balance(rq, cpu); 4320 trigger_load_balance(rq, cpu);
4298#endif 4321#endif
4322 perf_counter_task_tick(curr, cpu);
4299} 4323}
4300 4324
4301#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4325#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a2328170..4be8bbc7577 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
174cond_syscall(compat_sys_timerfd_gettime); 174cond_syscall(compat_sys_timerfd_gettime);
175cond_syscall(sys_eventfd); 175cond_syscall(sys_eventfd);
176cond_syscall(sys_eventfd2); 176cond_syscall(sys_eventfd2);
177
178/* performance counters: */
179cond_syscall(sys_perf_counter_open);