diff options
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r-- | kernel/perf_counter.c | 943 |
1 files changed, 943 insertions, 0 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 000000000000..20508f053658 --- /dev/null +++ b/kernel/perf_counter.c | |||
@@ -0,0 +1,943 @@ | |||
1 | /* | ||
2 | * Performance counter core code | ||
3 | * | ||
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | ||
6 | * | ||
7 | * For licencing details see kernel-base/COPYING | ||
8 | */ | ||
9 | |||
10 | #include <linux/fs.h> | ||
11 | #include <linux/cpu.h> | ||
12 | #include <linux/smp.h> | ||
13 | #include <linux/poll.h> | ||
14 | #include <linux/sysfs.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/percpu.h> | ||
17 | #include <linux/uaccess.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | #include <linux/anon_inodes.h> | ||
20 | #include <linux/perf_counter.h> | ||
21 | |||
22 | /* | ||
23 | * Each CPU has a list of per CPU counters: | ||
24 | */ | ||
25 | DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
26 | |||
27 | int perf_max_counters __read_mostly; | ||
28 | static int perf_reserved_percpu __read_mostly; | ||
29 | static int perf_overcommit __read_mostly = 1; | ||
30 | |||
31 | /* | ||
32 | * Mutex for (sysadmin-configurable) counter reservations: | ||
33 | */ | ||
34 | static DEFINE_MUTEX(perf_resource_mutex); | ||
35 | |||
36 | /* | ||
37 | * Architecture provided APIs - weak aliases: | ||
38 | */ | ||
39 | |||
40 | int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) | ||
41 | { | ||
42 | return -EINVAL; | ||
43 | } | ||
44 | |||
45 | void __weak hw_perf_counter_enable(struct perf_counter *counter) { } | ||
46 | void __weak hw_perf_counter_disable(struct perf_counter *counter) { } | ||
47 | void __weak hw_perf_counter_read(struct perf_counter *counter) { } | ||
48 | void __weak hw_perf_disable_all(void) { } | ||
49 | void __weak hw_perf_enable_all(void) { } | ||
50 | void __weak hw_perf_counter_setup(void) { } | ||
51 | |||
52 | #if BITS_PER_LONG == 64 | ||
53 | |||
54 | /* | ||
55 | * Read the cached counter in counter safe against cross CPU / NMI | ||
56 | * modifications. 64 bit version - no complications. | ||
57 | */ | ||
58 | static inline u64 perf_read_counter_safe(struct perf_counter *counter) | ||
59 | { | ||
60 | return (u64) atomic64_read(&counter->count); | ||
61 | } | ||
62 | |||
63 | #else | ||
64 | |||
65 | /* | ||
66 | * Read the cached counter in counter safe against cross CPU / NMI | ||
67 | * modifications. 32 bit version. | ||
68 | */ | ||
69 | static u64 perf_read_counter_safe(struct perf_counter *counter) | ||
70 | { | ||
71 | u32 cntl, cnth; | ||
72 | |||
73 | local_irq_disable(); | ||
74 | do { | ||
75 | cnth = atomic_read(&counter->count32[1]); | ||
76 | cntl = atomic_read(&counter->count32[0]); | ||
77 | } while (cnth != atomic_read(&counter->count32[1])); | ||
78 | |||
79 | local_irq_enable(); | ||
80 | |||
81 | return cntl | ((u64) cnth) << 32; | ||
82 | } | ||
83 | |||
84 | #endif | ||
85 | |||
86 | /* | ||
87 | * Cross CPU call to remove a performance counter | ||
88 | * | ||
89 | * We disable the counter on the hardware level first. After that we | ||
90 | * remove it from the context list. | ||
91 | */ | ||
92 | static void __perf_remove_from_context(void *info) | ||
93 | { | ||
94 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
95 | struct perf_counter *counter = info; | ||
96 | struct perf_counter_context *ctx = counter->ctx; | ||
97 | |||
98 | /* | ||
99 | * If this is a task context, we need to check whether it is | ||
100 | * the current task context of this cpu. If not it has been | ||
101 | * scheduled out before the smp call arrived. | ||
102 | */ | ||
103 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
104 | return; | ||
105 | |||
106 | spin_lock(&ctx->lock); | ||
107 | |||
108 | if (counter->active) { | ||
109 | hw_perf_counter_disable(counter); | ||
110 | counter->active = 0; | ||
111 | ctx->nr_active--; | ||
112 | cpuctx->active_oncpu--; | ||
113 | counter->task = NULL; | ||
114 | } | ||
115 | ctx->nr_counters--; | ||
116 | |||
117 | /* | ||
118 | * Protect the list operation against NMI by disabling the | ||
119 | * counters on a global level. NOP for non NMI based counters. | ||
120 | */ | ||
121 | hw_perf_disable_all(); | ||
122 | list_del_init(&counter->list); | ||
123 | hw_perf_enable_all(); | ||
124 | |||
125 | if (!ctx->task) { | ||
126 | /* | ||
127 | * Allow more per task counters with respect to the | ||
128 | * reservation: | ||
129 | */ | ||
130 | cpuctx->max_pertask = | ||
131 | min(perf_max_counters - ctx->nr_counters, | ||
132 | perf_max_counters - perf_reserved_percpu); | ||
133 | } | ||
134 | |||
135 | spin_unlock(&ctx->lock); | ||
136 | } | ||
137 | |||
138 | |||
139 | /* | ||
140 | * Remove the counter from a task's (or a CPU's) list of counters. | ||
141 | * | ||
142 | * Must be called with counter->mutex held. | ||
143 | * | ||
144 | * CPU counters are removed with a smp call. For task counters we only | ||
145 | * call when the task is on a CPU. | ||
146 | */ | ||
147 | static void perf_remove_from_context(struct perf_counter *counter) | ||
148 | { | ||
149 | struct perf_counter_context *ctx = counter->ctx; | ||
150 | struct task_struct *task = ctx->task; | ||
151 | |||
152 | if (!task) { | ||
153 | /* | ||
154 | * Per cpu counters are removed via an smp call and | ||
155 | * the removal is always sucessful. | ||
156 | */ | ||
157 | smp_call_function_single(counter->cpu, | ||
158 | __perf_remove_from_context, | ||
159 | counter, 1); | ||
160 | return; | ||
161 | } | ||
162 | |||
163 | retry: | ||
164 | task_oncpu_function_call(task, __perf_remove_from_context, | ||
165 | counter); | ||
166 | |||
167 | spin_lock_irq(&ctx->lock); | ||
168 | /* | ||
169 | * If the context is active we need to retry the smp call. | ||
170 | */ | ||
171 | if (ctx->nr_active && !list_empty(&counter->list)) { | ||
172 | spin_unlock_irq(&ctx->lock); | ||
173 | goto retry; | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * The lock prevents that this context is scheduled in so we | ||
178 | * can remove the counter safely, if it the call above did not | ||
179 | * succeed. | ||
180 | */ | ||
181 | if (!list_empty(&counter->list)) { | ||
182 | ctx->nr_counters--; | ||
183 | list_del_init(&counter->list); | ||
184 | counter->task = NULL; | ||
185 | } | ||
186 | spin_unlock_irq(&ctx->lock); | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * Cross CPU call to install and enable a preformance counter | ||
191 | */ | ||
192 | static void __perf_install_in_context(void *info) | ||
193 | { | ||
194 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
195 | struct perf_counter *counter = info; | ||
196 | struct perf_counter_context *ctx = counter->ctx; | ||
197 | int cpu = smp_processor_id(); | ||
198 | |||
199 | /* | ||
200 | * If this is a task context, we need to check whether it is | ||
201 | * the current task context of this cpu. If not it has been | ||
202 | * scheduled out before the smp call arrived. | ||
203 | */ | ||
204 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
205 | return; | ||
206 | |||
207 | spin_lock(&ctx->lock); | ||
208 | |||
209 | /* | ||
210 | * Protect the list operation against NMI by disabling the | ||
211 | * counters on a global level. NOP for non NMI based counters. | ||
212 | */ | ||
213 | hw_perf_disable_all(); | ||
214 | list_add_tail(&counter->list, &ctx->counters); | ||
215 | hw_perf_enable_all(); | ||
216 | |||
217 | ctx->nr_counters++; | ||
218 | |||
219 | if (cpuctx->active_oncpu < perf_max_counters) { | ||
220 | hw_perf_counter_enable(counter); | ||
221 | counter->active = 1; | ||
222 | counter->oncpu = cpu; | ||
223 | ctx->nr_active++; | ||
224 | cpuctx->active_oncpu++; | ||
225 | } | ||
226 | |||
227 | if (!ctx->task && cpuctx->max_pertask) | ||
228 | cpuctx->max_pertask--; | ||
229 | |||
230 | spin_unlock(&ctx->lock); | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * Attach a performance counter to a context | ||
235 | * | ||
236 | * First we add the counter to the list with the hardware enable bit | ||
237 | * in counter->hw_config cleared. | ||
238 | * | ||
239 | * If the counter is attached to a task which is on a CPU we use a smp | ||
240 | * call to enable it in the task context. The task might have been | ||
241 | * scheduled away, but we check this in the smp call again. | ||
242 | */ | ||
243 | static void | ||
244 | perf_install_in_context(struct perf_counter_context *ctx, | ||
245 | struct perf_counter *counter, | ||
246 | int cpu) | ||
247 | { | ||
248 | struct task_struct *task = ctx->task; | ||
249 | |||
250 | counter->ctx = ctx; | ||
251 | if (!task) { | ||
252 | /* | ||
253 | * Per cpu counters are installed via an smp call and | ||
254 | * the install is always sucessful. | ||
255 | */ | ||
256 | smp_call_function_single(cpu, __perf_install_in_context, | ||
257 | counter, 1); | ||
258 | return; | ||
259 | } | ||
260 | |||
261 | counter->task = task; | ||
262 | retry: | ||
263 | task_oncpu_function_call(task, __perf_install_in_context, | ||
264 | counter); | ||
265 | |||
266 | spin_lock_irq(&ctx->lock); | ||
267 | /* | ||
268 | * If the context is active and the counter has not been added | ||
269 | * we need to retry the smp call. | ||
270 | */ | ||
271 | if (ctx->nr_active && list_empty(&counter->list)) { | ||
272 | spin_unlock_irq(&ctx->lock); | ||
273 | goto retry; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * The lock prevents that this context is scheduled in so we | ||
278 | * can add the counter safely, if it the call above did not | ||
279 | * succeed. | ||
280 | */ | ||
281 | if (list_empty(&counter->list)) { | ||
282 | list_add_tail(&counter->list, &ctx->counters); | ||
283 | ctx->nr_counters++; | ||
284 | } | ||
285 | spin_unlock_irq(&ctx->lock); | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Called from scheduler to remove the counters of the current task, | ||
290 | * with interrupts disabled. | ||
291 | * | ||
292 | * We stop each counter and update the counter value in counter->count. | ||
293 | * | ||
294 | * This does not protect us against NMI, but hw_perf_counter_disable() | ||
295 | * sets the disabled bit in the control field of counter _before_ | ||
296 | * accessing the counter control register. If a NMI hits, then it will | ||
297 | * not restart the counter. | ||
298 | */ | ||
299 | void perf_counter_task_sched_out(struct task_struct *task, int cpu) | ||
300 | { | ||
301 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
302 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
303 | struct perf_counter *counter; | ||
304 | |||
305 | if (likely(!cpuctx->task_ctx)) | ||
306 | return; | ||
307 | |||
308 | spin_lock(&ctx->lock); | ||
309 | list_for_each_entry(counter, &ctx->counters, list) { | ||
310 | if (!ctx->nr_active) | ||
311 | break; | ||
312 | if (counter->active) { | ||
313 | hw_perf_counter_disable(counter); | ||
314 | counter->active = 0; | ||
315 | counter->oncpu = -1; | ||
316 | ctx->nr_active--; | ||
317 | cpuctx->active_oncpu--; | ||
318 | } | ||
319 | } | ||
320 | spin_unlock(&ctx->lock); | ||
321 | cpuctx->task_ctx = NULL; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * Called from scheduler to add the counters of the current task | ||
326 | * with interrupts disabled. | ||
327 | * | ||
328 | * We restore the counter value and then enable it. | ||
329 | * | ||
330 | * This does not protect us against NMI, but hw_perf_counter_enable() | ||
331 | * sets the enabled bit in the control field of counter _before_ | ||
332 | * accessing the counter control register. If a NMI hits, then it will | ||
333 | * keep the counter running. | ||
334 | */ | ||
335 | void perf_counter_task_sched_in(struct task_struct *task, int cpu) | ||
336 | { | ||
337 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
338 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
339 | struct perf_counter *counter; | ||
340 | |||
341 | if (likely(!ctx->nr_counters)) | ||
342 | return; | ||
343 | |||
344 | spin_lock(&ctx->lock); | ||
345 | list_for_each_entry(counter, &ctx->counters, list) { | ||
346 | if (ctx->nr_active == cpuctx->max_pertask) | ||
347 | break; | ||
348 | if (counter->cpu != -1 && counter->cpu != cpu) | ||
349 | continue; | ||
350 | |||
351 | hw_perf_counter_enable(counter); | ||
352 | counter->active = 1; | ||
353 | counter->oncpu = cpu; | ||
354 | ctx->nr_active++; | ||
355 | cpuctx->active_oncpu++; | ||
356 | } | ||
357 | spin_unlock(&ctx->lock); | ||
358 | cpuctx->task_ctx = ctx; | ||
359 | } | ||
360 | |||
361 | void perf_counter_task_tick(struct task_struct *curr, int cpu) | ||
362 | { | ||
363 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
364 | struct perf_counter *counter; | ||
365 | |||
366 | if (likely(!ctx->nr_counters)) | ||
367 | return; | ||
368 | |||
369 | perf_counter_task_sched_out(curr, cpu); | ||
370 | |||
371 | spin_lock(&ctx->lock); | ||
372 | |||
373 | /* | ||
374 | * Rotate the first entry last: | ||
375 | */ | ||
376 | hw_perf_disable_all(); | ||
377 | list_for_each_entry(counter, &ctx->counters, list) { | ||
378 | list_del(&counter->list); | ||
379 | list_add_tail(&counter->list, &ctx->counters); | ||
380 | break; | ||
381 | } | ||
382 | hw_perf_enable_all(); | ||
383 | |||
384 | spin_unlock(&ctx->lock); | ||
385 | |||
386 | perf_counter_task_sched_in(curr, cpu); | ||
387 | } | ||
388 | |||
389 | /* | ||
390 | * Initialize the perf_counter context in task_struct | ||
391 | */ | ||
392 | void perf_counter_init_task(struct task_struct *task) | ||
393 | { | ||
394 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
395 | |||
396 | spin_lock_init(&ctx->lock); | ||
397 | INIT_LIST_HEAD(&ctx->counters); | ||
398 | ctx->nr_counters = 0; | ||
399 | ctx->task = task; | ||
400 | } | ||
401 | |||
/*
 * Cross CPU call: hand the counter passed in @info to the architecture's
 * hw_perf_counter_read() hook.
 */
static void __hw_perf_counter_read(void *info)
{
	struct perf_counter *counter = info;

	hw_perf_counter_read(counter);
}
409 | |||
410 | static u64 perf_read_counter(struct perf_counter *counter) | ||
411 | { | ||
412 | /* | ||
413 | * If counter is enabled and currently active on a CPU, update the | ||
414 | * value in the counter structure: | ||
415 | */ | ||
416 | if (counter->active) { | ||
417 | smp_call_function_single(counter->oncpu, | ||
418 | __hw_perf_counter_read, counter, 1); | ||
419 | } | ||
420 | |||
421 | return perf_read_counter_safe(counter); | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * Cross CPU call to switch performance data pointers | ||
426 | */ | ||
427 | static void __perf_switch_irq_data(void *info) | ||
428 | { | ||
429 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
430 | struct perf_counter *counter = info; | ||
431 | struct perf_counter_context *ctx = counter->ctx; | ||
432 | struct perf_data *oldirqdata = counter->irqdata; | ||
433 | |||
434 | /* | ||
435 | * If this is a task context, we need to check whether it is | ||
436 | * the current task context of this cpu. If not it has been | ||
437 | * scheduled out before the smp call arrived. | ||
438 | */ | ||
439 | if (ctx->task) { | ||
440 | if (cpuctx->task_ctx != ctx) | ||
441 | return; | ||
442 | spin_lock(&ctx->lock); | ||
443 | } | ||
444 | |||
445 | /* Change the pointer NMI safe */ | ||
446 | atomic_long_set((atomic_long_t *)&counter->irqdata, | ||
447 | (unsigned long) counter->usrdata); | ||
448 | counter->usrdata = oldirqdata; | ||
449 | |||
450 | if (ctx->task) | ||
451 | spin_unlock(&ctx->lock); | ||
452 | } | ||
453 | |||
454 | static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) | ||
455 | { | ||
456 | struct perf_counter_context *ctx = counter->ctx; | ||
457 | struct perf_data *oldirqdata = counter->irqdata; | ||
458 | struct task_struct *task = ctx->task; | ||
459 | |||
460 | if (!task) { | ||
461 | smp_call_function_single(counter->cpu, | ||
462 | __perf_switch_irq_data, | ||
463 | counter, 1); | ||
464 | return counter->usrdata; | ||
465 | } | ||
466 | |||
467 | retry: | ||
468 | spin_lock_irq(&ctx->lock); | ||
469 | if (!counter->active) { | ||
470 | counter->irqdata = counter->usrdata; | ||
471 | counter->usrdata = oldirqdata; | ||
472 | spin_unlock_irq(&ctx->lock); | ||
473 | return oldirqdata; | ||
474 | } | ||
475 | spin_unlock_irq(&ctx->lock); | ||
476 | task_oncpu_function_call(task, __perf_switch_irq_data, counter); | ||
477 | /* Might have failed, because task was scheduled out */ | ||
478 | if (counter->irqdata == oldirqdata) | ||
479 | goto retry; | ||
480 | |||
481 | return counter->usrdata; | ||
482 | } | ||
483 | |||
484 | static void put_context(struct perf_counter_context *ctx) | ||
485 | { | ||
486 | if (ctx->task) | ||
487 | put_task_struct(ctx->task); | ||
488 | } | ||
489 | |||
490 | static struct perf_counter_context *find_get_context(pid_t pid, int cpu) | ||
491 | { | ||
492 | struct perf_cpu_context *cpuctx; | ||
493 | struct perf_counter_context *ctx; | ||
494 | struct task_struct *task; | ||
495 | |||
496 | /* | ||
497 | * If cpu is not a wildcard then this is a percpu counter: | ||
498 | */ | ||
499 | if (cpu != -1) { | ||
500 | /* Must be root to operate on a CPU counter: */ | ||
501 | if (!capable(CAP_SYS_ADMIN)) | ||
502 | return ERR_PTR(-EACCES); | ||
503 | |||
504 | if (cpu < 0 || cpu > num_possible_cpus()) | ||
505 | return ERR_PTR(-EINVAL); | ||
506 | |||
507 | /* | ||
508 | * We could be clever and allow to attach a counter to an | ||
509 | * offline CPU and activate it when the CPU comes up, but | ||
510 | * that's for later. | ||
511 | */ | ||
512 | if (!cpu_isset(cpu, cpu_online_map)) | ||
513 | return ERR_PTR(-ENODEV); | ||
514 | |||
515 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
516 | ctx = &cpuctx->ctx; | ||
517 | |||
518 | WARN_ON_ONCE(ctx->task); | ||
519 | return ctx; | ||
520 | } | ||
521 | |||
522 | rcu_read_lock(); | ||
523 | if (!pid) | ||
524 | task = current; | ||
525 | else | ||
526 | task = find_task_by_vpid(pid); | ||
527 | if (task) | ||
528 | get_task_struct(task); | ||
529 | rcu_read_unlock(); | ||
530 | |||
531 | if (!task) | ||
532 | return ERR_PTR(-ESRCH); | ||
533 | |||
534 | ctx = &task->perf_counter_ctx; | ||
535 | ctx->task = task; | ||
536 | |||
537 | /* Reuse ptrace permission checks for now. */ | ||
538 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) { | ||
539 | put_context(ctx); | ||
540 | return ERR_PTR(-EACCES); | ||
541 | } | ||
542 | |||
543 | return ctx; | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * Called when the last reference to the file is gone. | ||
548 | */ | ||
549 | static int perf_release(struct inode *inode, struct file *file) | ||
550 | { | ||
551 | struct perf_counter *counter = file->private_data; | ||
552 | struct perf_counter_context *ctx = counter->ctx; | ||
553 | |||
554 | file->private_data = NULL; | ||
555 | |||
556 | mutex_lock(&counter->mutex); | ||
557 | |||
558 | perf_remove_from_context(counter); | ||
559 | put_context(ctx); | ||
560 | |||
561 | mutex_unlock(&counter->mutex); | ||
562 | |||
563 | kfree(counter); | ||
564 | |||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * Read the performance counter - simple non blocking version for now | ||
570 | */ | ||
571 | static ssize_t | ||
572 | perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | ||
573 | { | ||
574 | u64 cntval; | ||
575 | |||
576 | if (count != sizeof(cntval)) | ||
577 | return -EINVAL; | ||
578 | |||
579 | mutex_lock(&counter->mutex); | ||
580 | cntval = perf_read_counter(counter); | ||
581 | mutex_unlock(&counter->mutex); | ||
582 | |||
583 | return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); | ||
584 | } | ||
585 | |||
586 | static ssize_t | ||
587 | perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) | ||
588 | { | ||
589 | if (!usrdata->len) | ||
590 | return 0; | ||
591 | |||
592 | count = min(count, (size_t)usrdata->len); | ||
593 | if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) | ||
594 | return -EFAULT; | ||
595 | |||
596 | /* Adjust the counters */ | ||
597 | usrdata->len -= count; | ||
598 | if (!usrdata->len) | ||
599 | usrdata->rd_idx = 0; | ||
600 | else | ||
601 | usrdata->rd_idx += count; | ||
602 | |||
603 | return count; | ||
604 | } | ||
605 | |||
606 | static ssize_t | ||
607 | perf_read_irq_data(struct perf_counter *counter, | ||
608 | char __user *buf, | ||
609 | size_t count, | ||
610 | int nonblocking) | ||
611 | { | ||
612 | struct perf_data *irqdata, *usrdata; | ||
613 | DECLARE_WAITQUEUE(wait, current); | ||
614 | ssize_t res; | ||
615 | |||
616 | irqdata = counter->irqdata; | ||
617 | usrdata = counter->usrdata; | ||
618 | |||
619 | if (usrdata->len + irqdata->len >= count) | ||
620 | goto read_pending; | ||
621 | |||
622 | if (nonblocking) | ||
623 | return -EAGAIN; | ||
624 | |||
625 | spin_lock_irq(&counter->waitq.lock); | ||
626 | __add_wait_queue(&counter->waitq, &wait); | ||
627 | for (;;) { | ||
628 | set_current_state(TASK_INTERRUPTIBLE); | ||
629 | if (usrdata->len + irqdata->len >= count) | ||
630 | break; | ||
631 | |||
632 | if (signal_pending(current)) | ||
633 | break; | ||
634 | |||
635 | spin_unlock_irq(&counter->waitq.lock); | ||
636 | schedule(); | ||
637 | spin_lock_irq(&counter->waitq.lock); | ||
638 | } | ||
639 | __remove_wait_queue(&counter->waitq, &wait); | ||
640 | __set_current_state(TASK_RUNNING); | ||
641 | spin_unlock_irq(&counter->waitq.lock); | ||
642 | |||
643 | if (usrdata->len + irqdata->len < count) | ||
644 | return -ERESTARTSYS; | ||
645 | read_pending: | ||
646 | mutex_lock(&counter->mutex); | ||
647 | |||
648 | /* Drain pending data first: */ | ||
649 | res = perf_copy_usrdata(usrdata, buf, count); | ||
650 | if (res < 0 || res == count) | ||
651 | goto out; | ||
652 | |||
653 | /* Switch irq buffer: */ | ||
654 | usrdata = perf_switch_irq_data(counter); | ||
655 | if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { | ||
656 | if (!res) | ||
657 | res = -EFAULT; | ||
658 | } else { | ||
659 | res = count; | ||
660 | } | ||
661 | out: | ||
662 | mutex_unlock(&counter->mutex); | ||
663 | |||
664 | return res; | ||
665 | } | ||
666 | |||
667 | static ssize_t | ||
668 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
669 | { | ||
670 | struct perf_counter *counter = file->private_data; | ||
671 | |||
672 | switch (counter->record_type) { | ||
673 | case PERF_RECORD_SIMPLE: | ||
674 | return perf_read_hw(counter, buf, count); | ||
675 | |||
676 | case PERF_RECORD_IRQ: | ||
677 | case PERF_RECORD_GROUP: | ||
678 | return perf_read_irq_data(counter, buf, count, | ||
679 | file->f_flags & O_NONBLOCK); | ||
680 | } | ||
681 | return -EINVAL; | ||
682 | } | ||
683 | |||
684 | static unsigned int perf_poll(struct file *file, poll_table *wait) | ||
685 | { | ||
686 | struct perf_counter *counter = file->private_data; | ||
687 | unsigned int events = 0; | ||
688 | unsigned long flags; | ||
689 | |||
690 | poll_wait(file, &counter->waitq, wait); | ||
691 | |||
692 | spin_lock_irqsave(&counter->waitq.lock, flags); | ||
693 | if (counter->usrdata->len || counter->irqdata->len) | ||
694 | events |= POLLIN; | ||
695 | spin_unlock_irqrestore(&counter->waitq.lock, flags); | ||
696 | |||
697 | return events; | ||
698 | } | ||
699 | |||
700 | static const struct file_operations perf_fops = { | ||
701 | .release = perf_release, | ||
702 | .read = perf_read, | ||
703 | .poll = perf_poll, | ||
704 | }; | ||
705 | |||
706 | /* | ||
707 | * Allocate and initialize a counter structure | ||
708 | */ | ||
709 | static struct perf_counter * | ||
710 | perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) | ||
711 | { | ||
712 | struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); | ||
713 | |||
714 | if (!counter) | ||
715 | return NULL; | ||
716 | |||
717 | mutex_init(&counter->mutex); | ||
718 | INIT_LIST_HEAD(&counter->list); | ||
719 | init_waitqueue_head(&counter->waitq); | ||
720 | |||
721 | counter->irqdata = &counter->data[0]; | ||
722 | counter->usrdata = &counter->data[1]; | ||
723 | counter->cpu = cpu; | ||
724 | counter->record_type = record_type; | ||
725 | counter->__irq_period = hw_event_period; | ||
726 | counter->wakeup_pending = 0; | ||
727 | |||
728 | return counter; | ||
729 | } | ||
730 | |||
731 | /** | ||
732 | * sys_perf_task_open - open a performance counter associate it to a task | ||
733 | * @hw_event_type: event type for monitoring/sampling... | ||
734 | * @pid: target pid | ||
735 | */ | ||
736 | asmlinkage int | ||
737 | sys_perf_counter_open(u32 hw_event_type, | ||
738 | u32 hw_event_period, | ||
739 | u32 record_type, | ||
740 | pid_t pid, | ||
741 | int cpu) | ||
742 | { | ||
743 | struct perf_counter_context *ctx; | ||
744 | struct perf_counter *counter; | ||
745 | int ret; | ||
746 | |||
747 | ctx = find_get_context(pid, cpu); | ||
748 | if (IS_ERR(ctx)) | ||
749 | return PTR_ERR(ctx); | ||
750 | |||
751 | ret = -ENOMEM; | ||
752 | counter = perf_counter_alloc(hw_event_period, cpu, record_type); | ||
753 | if (!counter) | ||
754 | goto err_put_context; | ||
755 | |||
756 | ret = hw_perf_counter_init(counter, hw_event_type); | ||
757 | if (ret) | ||
758 | goto err_free_put_context; | ||
759 | |||
760 | perf_install_in_context(ctx, counter, cpu); | ||
761 | |||
762 | ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); | ||
763 | if (ret < 0) | ||
764 | goto err_remove_free_put_context; | ||
765 | |||
766 | return ret; | ||
767 | |||
768 | err_remove_free_put_context: | ||
769 | mutex_lock(&counter->mutex); | ||
770 | perf_remove_from_context(counter); | ||
771 | mutex_unlock(&counter->mutex); | ||
772 | |||
773 | err_free_put_context: | ||
774 | kfree(counter); | ||
775 | |||
776 | err_put_context: | ||
777 | put_context(ctx); | ||
778 | |||
779 | return ret; | ||
780 | } | ||
781 | |||
782 | static void __cpuinit perf_init_cpu(int cpu) | ||
783 | { | ||
784 | struct perf_cpu_context *ctx; | ||
785 | |||
786 | ctx = &per_cpu(perf_cpu_context, cpu); | ||
787 | spin_lock_init(&ctx->ctx.lock); | ||
788 | INIT_LIST_HEAD(&ctx->ctx.counters); | ||
789 | |||
790 | mutex_lock(&perf_resource_mutex); | ||
791 | ctx->max_pertask = perf_max_counters - perf_reserved_percpu; | ||
792 | mutex_unlock(&perf_resource_mutex); | ||
793 | hw_perf_counter_setup(); | ||
794 | } | ||
795 | |||
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Cross CPU call: remove every counter from this CPU's per-cpu context.
 * The _safe list walk is needed because each removal unlinks the entry.
 */
static void __perf_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *next;

	list_for_each_entry_safe(counter, next, &ctx->counters, list)
		__perf_remove_from_context(counter);
}

/*
 * Tear down the per-cpu counters of @cpu by running __perf_exit_cpu()
 * on that CPU and waiting for it to finish.
 */
static void perf_exit_cpu(int cpu)
{
	smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
}
#else
/* Without CPU hotplug there is nothing to tear down. */
static inline void perf_exit_cpu(int cpu)
{
}
#endif
814 | |||
815 | static int __cpuinit | ||
816 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | ||
817 | { | ||
818 | unsigned int cpu = (long)hcpu; | ||
819 | |||
820 | switch (action) { | ||
821 | |||
822 | case CPU_UP_PREPARE: | ||
823 | case CPU_UP_PREPARE_FROZEN: | ||
824 | perf_init_cpu(cpu); | ||
825 | break; | ||
826 | |||
827 | case CPU_DOWN_PREPARE: | ||
828 | case CPU_DOWN_PREPARE_FROZEN: | ||
829 | perf_exit_cpu(cpu); | ||
830 | break; | ||
831 | |||
832 | default: | ||
833 | break; | ||
834 | } | ||
835 | |||
836 | return NOTIFY_OK; | ||
837 | } | ||
838 | |||
839 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
840 | .notifier_call = perf_cpu_notify, | ||
841 | }; | ||
842 | |||
843 | static int __init perf_counter_init(void) | ||
844 | { | ||
845 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | ||
846 | (void *)(long)smp_processor_id()); | ||
847 | register_cpu_notifier(&perf_cpu_nb); | ||
848 | |||
849 | return 0; | ||
850 | } | ||
851 | early_initcall(perf_counter_init); | ||
852 | |||
853 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | ||
854 | { | ||
855 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
856 | } | ||
857 | |||
858 | static ssize_t | ||
859 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
860 | const char *buf, | ||
861 | size_t count) | ||
862 | { | ||
863 | struct perf_cpu_context *cpuctx; | ||
864 | unsigned long val; | ||
865 | int err, cpu, mpt; | ||
866 | |||
867 | err = strict_strtoul(buf, 10, &val); | ||
868 | if (err) | ||
869 | return err; | ||
870 | if (val > perf_max_counters) | ||
871 | return -EINVAL; | ||
872 | |||
873 | mutex_lock(&perf_resource_mutex); | ||
874 | perf_reserved_percpu = val; | ||
875 | for_each_online_cpu(cpu) { | ||
876 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
877 | spin_lock_irq(&cpuctx->ctx.lock); | ||
878 | mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, | ||
879 | perf_max_counters - perf_reserved_percpu); | ||
880 | cpuctx->max_pertask = mpt; | ||
881 | spin_unlock_irq(&cpuctx->ctx.lock); | ||
882 | } | ||
883 | mutex_unlock(&perf_resource_mutex); | ||
884 | |||
885 | return count; | ||
886 | } | ||
887 | |||
888 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | ||
889 | { | ||
890 | return sprintf(buf, "%d\n", perf_overcommit); | ||
891 | } | ||
892 | |||
893 | static ssize_t | ||
894 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | ||
895 | { | ||
896 | unsigned long val; | ||
897 | int err; | ||
898 | |||
899 | err = strict_strtoul(buf, 10, &val); | ||
900 | if (err) | ||
901 | return err; | ||
902 | if (val > 1) | ||
903 | return -EINVAL; | ||
904 | |||
905 | mutex_lock(&perf_resource_mutex); | ||
906 | perf_overcommit = val; | ||
907 | mutex_unlock(&perf_resource_mutex); | ||
908 | |||
909 | return count; | ||
910 | } | ||
911 | |||
912 | static SYSDEV_CLASS_ATTR( | ||
913 | reserve_percpu, | ||
914 | 0644, | ||
915 | perf_show_reserve_percpu, | ||
916 | perf_set_reserve_percpu | ||
917 | ); | ||
918 | |||
919 | static SYSDEV_CLASS_ATTR( | ||
920 | overcommit, | ||
921 | 0644, | ||
922 | perf_show_overcommit, | ||
923 | perf_set_overcommit | ||
924 | ); | ||
925 | |||
926 | static struct attribute *perfclass_attrs[] = { | ||
927 | &attr_reserve_percpu.attr, | ||
928 | &attr_overcommit.attr, | ||
929 | NULL | ||
930 | }; | ||
931 | |||
932 | static struct attribute_group perfclass_attr_group = { | ||
933 | .attrs = perfclass_attrs, | ||
934 | .name = "perf_counters", | ||
935 | }; | ||
936 | |||
937 | static int __init perf_counter_sysfs_init(void) | ||
938 | { | ||
939 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
940 | &perfclass_attr_group); | ||
941 | } | ||
942 | device_initcall(perf_counter_sysfs_init); | ||
943 | |||