path: root/kernel/events
author    Borislav Petkov <borislav.petkov@amd.com>  2010-10-26 14:24:03 -0400
committer Borislav Petkov <borislav.petkov@amd.com>  2011-05-03 06:59:43 -0400
commit    fae85b7c8bcc7de9c0a2698587e20c15beb7d5a6 (patch)
tree      96ea37ea08d52b2ef89f823f6e43ba8b15cc66bb /kernel/events
parent    ac0a3260f37b8616da8d33488ec94b94e6ae5b31 (diff)
perf: Start the restructuring
mv kernel/perf_event.c -> kernel/events/core.c. From there, all further sensible splitting can happen. The idea is that, with perf_event.c having become pretty sizable and with the advent of the marriage with ftrace, splitting the functionality into its logical parts should help speed up the unification and manage the complexity of the subsystem.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'kernel/events')
-rw-r--r--  kernel/events/Makefile     5
-rw-r--r--  kernel/events/core.c    7455
2 files changed, 7460 insertions, 0 deletions
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..26c00e4570e5
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y += core.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
new file mode 100644
index 000000000000..440bc485bbff
--- /dev/null
+++ b/kernel/events/core.c
@@ -0,0 +1,7455 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/idr.h>
17#include <linux/file.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/hash.h>
21#include <linux/sysfs.h>
22#include <linux/dcache.h>
23#include <linux/percpu.h>
24#include <linux/ptrace.h>
25#include <linux/reboot.h>
26#include <linux/vmstat.h>
27#include <linux/device.h>
28#include <linux/vmalloc.h>
29#include <linux/hardirq.h>
30#include <linux/rculist.h>
31#include <linux/uaccess.h>
32#include <linux/syscalls.h>
33#include <linux/anon_inodes.h>
34#include <linux/kernel_stat.h>
35#include <linux/perf_event.h>
36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h>
38
39#include <asm/irq_regs.h>
40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
118enum event_type_t {
119 EVENT_FLEXIBLE = 0x1,
120 EVENT_PINNED = 0x2,
121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
122};
123
124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
131static atomic_t nr_mmap_events __read_mostly;
132static atomic_t nr_comm_events __read_mostly;
133static atomic_t nr_task_events __read_mostly;
134
135static LIST_HEAD(pmus);
136static DEFINE_MUTEX(pmus_lock);
137static struct srcu_struct pmus_srcu;
138
139/*
140 * perf event paranoia level:
141 * -1 - not paranoid at all
142 * 0 - disallow raw tracepoint access for unpriv
143 * 1 - disallow cpu events for unpriv
144 * 2 - disallow kernel profiling for unpriv
145 */
146int sysctl_perf_event_paranoid __read_mostly = 1;
147
148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
150
151/*
152 * max perf event sample rate
153 */
154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
172
173static atomic64_t perf_event_id;
174
175static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
176 enum event_type_t event_type);
177
178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
184
185void __weak perf_event_print_debug(void) { }
186
187extern __weak const char *perf_pmu_name(void)
188{
189 return "pmu";
190}
191
192static inline u64 perf_clock(void)
193{
194 return local_clock();
195}
196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
205/*
206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
209 */
210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
316
317/*
318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
322 */
323void perf_cgroup_switch(struct task_struct *task, int mode)
324{
325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
330 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
383}
384
385static inline void perf_cgroup_sched_out(struct task_struct *task)
386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
389
390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
419
420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
433
434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
436{
437 struct perf_cgroup_info *t;
438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
440}
441
442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
444{
445 /*
446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
448 * perf_mark_enable() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
453}
454
455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
458{
459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
474}
475#else /* !CONFIG_CGROUP_PERF */
476
477static inline bool
478perf_cgroup_match(struct perf_event *event)
479{
480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
499
500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
552void perf_pmu_disable(struct pmu *pmu)
553{
554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
555 if (!(*count)++)
556 pmu->pmu_disable(pmu);
557}
558
559void perf_pmu_enable(struct pmu *pmu)
560{
561 int *count = this_cpu_ptr(pmu->pmu_disable_count);
562 if (!--(*count))
563 pmu->pmu_enable(pmu);
564}
565
566static DEFINE_PER_CPU(struct list_head, rotation_list);
567
568/*
569 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
570 * because they're strictly cpu affine and rotate_start is called with IRQs
571 * disabled, while rotate_context is called from IRQ context.
572 */
573static void perf_pmu_rotate_start(struct pmu *pmu)
574{
575 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
576 struct list_head *head = &__get_cpu_var(rotation_list);
577
578 WARN_ON(!irqs_disabled());
579
580 if (list_empty(&cpuctx->rotation_list))
581 list_add(&cpuctx->rotation_list, head);
582}
583
584static void get_ctx(struct perf_event_context *ctx)
585{
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
587}
588
589static void free_ctx(struct rcu_head *head)
590{
591 struct perf_event_context *ctx;
592
593 ctx = container_of(head, struct perf_event_context, rcu_head);
594 kfree(ctx);
595}
596
597static void put_ctx(struct perf_event_context *ctx)
598{
599 if (atomic_dec_and_test(&ctx->refcount)) {
600 if (ctx->parent_ctx)
601 put_ctx(ctx->parent_ctx);
602 if (ctx->task)
603 put_task_struct(ctx->task);
604 call_rcu(&ctx->rcu_head, free_ctx);
605 }
606}
607
608static void unclone_ctx(struct perf_event_context *ctx)
609{
610 if (ctx->parent_ctx) {
611 put_ctx(ctx->parent_ctx);
612 ctx->parent_ctx = NULL;
613 }
614}
615
616static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
617{
618 /*
619 * only top level events have the pid namespace they were created in
620 */
621 if (event->parent)
622 event = event->parent;
623
624 return task_tgid_nr_ns(p, event->ns);
625}
626
627static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
628{
629 /*
630 * only top level events have the pid namespace they were created in
631 */
632 if (event->parent)
633 event = event->parent;
634
635 return task_pid_nr_ns(p, event->ns);
636}
637
638/*
639 * If we inherit events we want to return the parent event id
640 * to userspace.
641 */
642static u64 primary_event_id(struct perf_event *event)
643{
644 u64 id = event->id;
645
646 if (event->parent)
647 id = event->parent->id;
648
649 return id;
650}
651
652/*
653 * Get the perf_event_context for a task and lock it.
654 * This has to cope with the fact that until it is locked,
655 * the context could get moved to another task.
656 */
657static struct perf_event_context *
658perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
659{
660 struct perf_event_context *ctx;
661
662 rcu_read_lock();
663retry:
664 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
665 if (ctx) {
666 /*
667 * If this context is a clone of another, it might
668 * get swapped for another underneath us by
669 * perf_event_task_sched_out, though the
670 * rcu_read_lock() protects us from any context
671 * getting freed. Lock the context and check if it
672 * got swapped before we could get the lock, and retry
673 * if so. If we locked the right context, then it
674 * can't get swapped on us any more.
675 */
676 raw_spin_lock_irqsave(&ctx->lock, *flags);
677 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
678 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
679 goto retry;
680 }
681
682 if (!atomic_inc_not_zero(&ctx->refcount)) {
683 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
684 ctx = NULL;
685 }
686 }
687 rcu_read_unlock();
688 return ctx;
689}
690
691/*
692 * Get the context for a task and increment its pin_count so it
693 * can't get swapped to another task. This also increments its
694 * reference count so that the context can't get freed.
695 */
696static struct perf_event_context *
697perf_pin_task_context(struct task_struct *task, int ctxn)
698{
699 struct perf_event_context *ctx;
700 unsigned long flags;
701
702 ctx = perf_lock_task_context(task, ctxn, &flags);
703 if (ctx) {
704 ++ctx->pin_count;
705 raw_spin_unlock_irqrestore(&ctx->lock, flags);
706 }
707 return ctx;
708}
709
710static void perf_unpin_context(struct perf_event_context *ctx)
711{
712 unsigned long flags;
713
714 raw_spin_lock_irqsave(&ctx->lock, flags);
715 --ctx->pin_count;
716 raw_spin_unlock_irqrestore(&ctx->lock, flags);
717}
718
719/*
720 * Update the record of the current time in a context.
721 */
722static void update_context_time(struct perf_event_context *ctx)
723{
724 u64 now = perf_clock();
725
726 ctx->time += now - ctx->timestamp;
727 ctx->timestamp = now;
728}
729
730static u64 perf_event_time(struct perf_event *event)
731{
732 struct perf_event_context *ctx = event->ctx;
733
734 if (is_cgroup_event(event))
735 return perf_cgroup_event_time(event);
736
737 return ctx ? ctx->time : 0;
738}
739
740/*
741 * Update the total_time_enabled and total_time_running fields for an event.
742 */
743static void update_event_times(struct perf_event *event)
744{
745 struct perf_event_context *ctx = event->ctx;
746 u64 run_end;
747
748 if (event->state < PERF_EVENT_STATE_INACTIVE ||
749 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
750 return;
751 /*
752 * in cgroup mode, time_enabled represents
753 * the time the event was enabled AND active
754 * tasks were in the monitored cgroup. This is
755 * independent of the activity of the context as
756 * there may be a mix of cgroup and non-cgroup events.
757 *
758 * That is why we treat cgroup events differently
759 * here.
760 */
761 if (is_cgroup_event(event))
762 run_end = perf_event_time(event);
763 else if (ctx->is_active)
764 run_end = ctx->time;
765 else
766 run_end = event->tstamp_stopped;
767
768 event->total_time_enabled = run_end - event->tstamp_enabled;
769
770 if (event->state == PERF_EVENT_STATE_INACTIVE)
771 run_end = event->tstamp_stopped;
772 else
773 run_end = perf_event_time(event);
774
775 event->total_time_running = run_end - event->tstamp_running;
776
777}
778
779/*
780 * Update total_time_enabled and total_time_running for all events in a group.
781 */
782static void update_group_times(struct perf_event *leader)
783{
784 struct perf_event *event;
785
786 update_event_times(leader);
787 list_for_each_entry(event, &leader->sibling_list, group_entry)
788 update_event_times(event);
789}
790
791static struct list_head *
792ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
793{
794 if (event->attr.pinned)
795 return &ctx->pinned_groups;
796 else
797 return &ctx->flexible_groups;
798}
799
800/*
801 * Add an event to the lists for its context.
802 * Must be called with ctx->mutex and ctx->lock held.
803 */
804static void
805list_add_event(struct perf_event *event, struct perf_event_context *ctx)
806{
807 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
808 event->attach_state |= PERF_ATTACH_CONTEXT;
809
810 /*
811 * If we're a standalone event or group leader, we go to the context
812 * list; group events are kept attached to the group so that
813 * perf_group_detach can, at all times, locate all siblings.
814 */
815 if (event->group_leader == event) {
816 struct list_head *list;
817
818 if (is_software_event(event))
819 event->group_flags |= PERF_GROUP_SOFTWARE;
820
821 list = ctx_group_list(event, ctx);
822 list_add_tail(&event->group_entry, list);
823 }
824
825 if (is_cgroup_event(event))
826 ctx->nr_cgroups++;
827
828 list_add_rcu(&event->event_entry, &ctx->event_list);
829 if (!ctx->nr_events)
830 perf_pmu_rotate_start(ctx->pmu);
831 ctx->nr_events++;
832 if (event->attr.inherit_stat)
833 ctx->nr_stat++;
834}
835
836/*
837 * Called at perf_event creation and when events are attached/detached from a
838 * group.
839 */
840static void perf_event__read_size(struct perf_event *event)
841{
842 int entry = sizeof(u64); /* value */
843 int size = 0;
844 int nr = 1;
845
846 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
847 size += sizeof(u64);
848
849 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
850 size += sizeof(u64);
851
852 if (event->attr.read_format & PERF_FORMAT_ID)
853 entry += sizeof(u64);
854
855 if (event->attr.read_format & PERF_FORMAT_GROUP) {
856 nr += event->group_leader->nr_siblings;
857 size += sizeof(u64);
858 }
859
860 size += entry * nr;
861 event->read_size = size;
862}
863
864static void perf_event__header_size(struct perf_event *event)
865{
866 struct perf_sample_data *data;
867 u64 sample_type = event->attr.sample_type;
868 u16 size = 0;
869
870 perf_event__read_size(event);
871
872 if (sample_type & PERF_SAMPLE_IP)
873 size += sizeof(data->ip);
874
875 if (sample_type & PERF_SAMPLE_ADDR)
876 size += sizeof(data->addr);
877
878 if (sample_type & PERF_SAMPLE_PERIOD)
879 size += sizeof(data->period);
880
881 if (sample_type & PERF_SAMPLE_READ)
882 size += event->read_size;
883
884 event->header_size = size;
885}
886
887static void perf_event__id_header_size(struct perf_event *event)
888{
889 struct perf_sample_data *data;
890 u64 sample_type = event->attr.sample_type;
891 u16 size = 0;
892
893 if (sample_type & PERF_SAMPLE_TID)
894 size += sizeof(data->tid_entry);
895
896 if (sample_type & PERF_SAMPLE_TIME)
897 size += sizeof(data->time);
898
899 if (sample_type & PERF_SAMPLE_ID)
900 size += sizeof(data->id);
901
902 if (sample_type & PERF_SAMPLE_STREAM_ID)
903 size += sizeof(data->stream_id);
904
905 if (sample_type & PERF_SAMPLE_CPU)
906 size += sizeof(data->cpu_entry);
907
908 event->id_header_size = size;
909}
910
911static void perf_group_attach(struct perf_event *event)
912{
913 struct perf_event *group_leader = event->group_leader, *pos;
914
915 /*
916 * We can have double attach due to group movement in perf_event_open.
917 */
918 if (event->attach_state & PERF_ATTACH_GROUP)
919 return;
920
921 event->attach_state |= PERF_ATTACH_GROUP;
922
923 if (group_leader == event)
924 return;
925
926 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
927 !is_software_event(event))
928 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
929
930 list_add_tail(&event->group_entry, &group_leader->sibling_list);
931 group_leader->nr_siblings++;
932
933 perf_event__header_size(group_leader);
934
935 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
936 perf_event__header_size(pos);
937}
938
939/*
940 * Remove an event from the lists for its context.
941 * Must be called with ctx->mutex and ctx->lock held.
942 */
943static void
944list_del_event(struct perf_event *event, struct perf_event_context *ctx)
945{
946 struct perf_cpu_context *cpuctx;
947 /*
948 * We can have double detach due to exit/hot-unplug + close.
949 */
950 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
951 return;
952
953 event->attach_state &= ~PERF_ATTACH_CONTEXT;
954
955 if (is_cgroup_event(event)) {
956 ctx->nr_cgroups--;
957 cpuctx = __get_cpu_context(ctx);
958 /*
959 * if there are no more cgroup events
960 * then clear cgrp to avoid stale pointer
961 * in update_cgrp_time_from_cpuctx()
962 */
963 if (!ctx->nr_cgroups)
964 cpuctx->cgrp = NULL;
965 }
966
967 ctx->nr_events--;
968 if (event->attr.inherit_stat)
969 ctx->nr_stat--;
970
971 list_del_rcu(&event->event_entry);
972
973 if (event->group_leader == event)
974 list_del_init(&event->group_entry);
975
976 update_group_times(event);
977
978 /*
979 * If event was in error state, then keep it
980 * that way, otherwise bogus counts will be
981 * returned on read(). The only way to get out
982 * of error state is by explicit re-enabling
983 * of the event
984 */
985 if (event->state > PERF_EVENT_STATE_OFF)
986 event->state = PERF_EVENT_STATE_OFF;
987}
988
989static void perf_group_detach(struct perf_event *event)
990{
991 struct perf_event *sibling, *tmp;
992 struct list_head *list = NULL;
993
994 /*
995 * We can have double detach due to exit/hot-unplug + close.
996 */
997 if (!(event->attach_state & PERF_ATTACH_GROUP))
998 return;
999
1000 event->attach_state &= ~PERF_ATTACH_GROUP;
1001
1002 /*
1003 * If this is a sibling, remove it from its group.
1004 */
1005 if (event->group_leader != event) {
1006 list_del_init(&event->group_entry);
1007 event->group_leader->nr_siblings--;
1008 goto out;
1009 }
1010
1011 if (!list_empty(&event->group_entry))
1012 list = &event->group_entry;
1013
1014 /*
1015 * If this was a group event with sibling events then
1016 * upgrade the siblings to singleton events by adding them
1017 * to whatever list we are on.
1018 */
1019 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1020 if (list)
1021 list_move_tail(&sibling->group_entry, list);
1022 sibling->group_leader = sibling;
1023
1024 /* Inherit group flags from the previous leader */
1025 sibling->group_flags = event->group_flags;
1026 }
1027
1028out:
1029 perf_event__header_size(event->group_leader);
1030
1031 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1032 perf_event__header_size(tmp);
1033}
1034
1035static inline int
1036event_filter_match(struct perf_event *event)
1037{
1038 return (event->cpu == -1 || event->cpu == smp_processor_id())
1039 && perf_cgroup_match(event);
1040}
1041
1042static void
1043event_sched_out(struct perf_event *event,
1044 struct perf_cpu_context *cpuctx,
1045 struct perf_event_context *ctx)
1046{
1047 u64 tstamp = perf_event_time(event);
1048 u64 delta;
1049 /*
1050 * An event which could not be activated because of
1051 * filter mismatch still needs to have its timings
1052 * maintained, otherwise bogus information is returned
1053 * via read() for time_enabled, time_running:
1054 */
1055 if (event->state == PERF_EVENT_STATE_INACTIVE
1056 && !event_filter_match(event)) {
1057 delta = tstamp - event->tstamp_stopped;
1058 event->tstamp_running += delta;
1059 event->tstamp_stopped = tstamp;
1060 }
1061
1062 if (event->state != PERF_EVENT_STATE_ACTIVE)
1063 return;
1064
1065 event->state = PERF_EVENT_STATE_INACTIVE;
1066 if (event->pending_disable) {
1067 event->pending_disable = 0;
1068 event->state = PERF_EVENT_STATE_OFF;
1069 }
1070 event->tstamp_stopped = tstamp;
1071 event->pmu->del(event, 0);
1072 event->oncpu = -1;
1073
1074 if (!is_software_event(event))
1075 cpuctx->active_oncpu--;
1076 ctx->nr_active--;
1077 if (event->attr.exclusive || !cpuctx->active_oncpu)
1078 cpuctx->exclusive = 0;
1079}
1080
1081static void
1082group_sched_out(struct perf_event *group_event,
1083 struct perf_cpu_context *cpuctx,
1084 struct perf_event_context *ctx)
1085{
1086 struct perf_event *event;
1087 int state = group_event->state;
1088
1089 event_sched_out(group_event, cpuctx, ctx);
1090
1091 /*
1092 * Schedule out siblings (if any):
1093 */
1094 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1095 event_sched_out(event, cpuctx, ctx);
1096
1097 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1098 cpuctx->exclusive = 0;
1099}
1100
1101/*
1102 * Cross CPU call to remove a performance event
1103 *
1104 * We disable the event on the hardware level first. After that we
1105 * remove it from the context list.
1106 */
1107static int __perf_remove_from_context(void *info)
1108{
1109 struct perf_event *event = info;
1110 struct perf_event_context *ctx = event->ctx;
1111 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1112
1113 raw_spin_lock(&ctx->lock);
1114 event_sched_out(event, cpuctx, ctx);
1115 list_del_event(event, ctx);
1116 raw_spin_unlock(&ctx->lock);
1117
1118 return 0;
1119}
1120
1121
1122/*
1123 * Remove the event from a task's (or a CPU's) list of events.
1124 *
1125 * CPU events are removed with an smp call. For task events we only
1126 * call when the task is on a CPU.
1127 *
1128 * If event->ctx is a cloned context, callers must make sure that
1129 * every task struct that event->ctx->task could possibly point to
1130 * remains valid. This is OK when called from perf_release since
1131 * that only calls us on the top-level context, which can't be a clone.
1132 * When called from perf_event_exit_task, it's OK because the
1133 * context has been detached from its task.
1134 */
1135static void perf_remove_from_context(struct perf_event *event)
1136{
1137 struct perf_event_context *ctx = event->ctx;
1138 struct task_struct *task = ctx->task;
1139
1140 lockdep_assert_held(&ctx->mutex);
1141
1142 if (!task) {
1143 /*
1144 * Per cpu events are removed via an smp call and
1145 * the removal is always successful.
1146 */
1147 cpu_function_call(event->cpu, __perf_remove_from_context, event);
1148 return;
1149 }
1150
1151retry:
1152 if (!task_function_call(task, __perf_remove_from_context, event))
1153 return;
1154
1155 raw_spin_lock_irq(&ctx->lock);
1156 /*
1157 * If we failed to find a running task, but find the context active now
1158 * that we've acquired the ctx->lock, retry.
1159 */
1160 if (ctx->is_active) {
1161 raw_spin_unlock_irq(&ctx->lock);
1162 goto retry;
1163 }
1164
1165 /*
1166 * Since the task isn't running, it's safe to remove the event, us
1167 * holding the ctx->lock ensures the task won't get scheduled in.
1168 */
1169 list_del_event(event, ctx);
1170 raw_spin_unlock_irq(&ctx->lock);
1171}
1172
1173/*
1174 * Cross CPU call to disable a performance event
1175 */
1176static int __perf_event_disable(void *info)
1177{
1178 struct perf_event *event = info;
1179 struct perf_event_context *ctx = event->ctx;
1180 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1181
1182 /*
1183 * If this is a per-task event, need to check whether this
1184 * event's task is the current task on this cpu.
1185 *
1186 * Can trigger due to concurrent perf_event_context_sched_out()
1187 * flipping contexts around.
1188 */
1189 if (ctx->task && cpuctx->task_ctx != ctx)
1190 return -EINVAL;
1191
1192 raw_spin_lock(&ctx->lock);
1193
1194 /*
1195 * If the event is on, turn it off.
1196 * If it is in error state, leave it in error state.
1197 */
1198 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1199 update_context_time(ctx);
1200 update_cgrp_time_from_event(event);
1201 update_group_times(event);
1202 if (event == event->group_leader)
1203 group_sched_out(event, cpuctx, ctx);
1204 else
1205 event_sched_out(event, cpuctx, ctx);
1206 event->state = PERF_EVENT_STATE_OFF;
1207 }
1208
1209 raw_spin_unlock(&ctx->lock);
1210
1211 return 0;
1212}
1213
1214/*
1215 * Disable an event.
1216 *
1217 * If event->ctx is a cloned context, callers must make sure that
1218 * every task struct that event->ctx->task could possibly point to
1219 * remains valid. This condition is satisfied when called through
1220 * perf_event_for_each_child or perf_event_for_each because they
1221 * hold the top-level event's child_mutex, so any descendant that
1222 * goes to exit will block in sync_child_event.
1223 * When called from perf_pending_event it's OK because event->ctx
1224 * is the current context on this CPU and preemption is disabled,
1225 * hence we can't get into perf_event_task_sched_out for this context.
1226 */
1227void perf_event_disable(struct perf_event *event)
1228{
1229 struct perf_event_context *ctx = event->ctx;
1230 struct task_struct *task = ctx->task;
1231
1232 if (!task) {
1233 /*
1234 * Disable the event on the cpu that it's on
1235 */
1236 cpu_function_call(event->cpu, __perf_event_disable, event);
1237 return;
1238 }
1239
1240retry:
1241 if (!task_function_call(task, __perf_event_disable, event))
1242 return;
1243
1244 raw_spin_lock_irq(&ctx->lock);
1245 /*
1246 * If the event is still active, we need to retry the cross-call.
1247 */
1248 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1249 raw_spin_unlock_irq(&ctx->lock);
1250 /*
1251 * Reload the task pointer, it might have been changed by
1252 * a concurrent perf_event_context_sched_out().
1253 */
1254 task = ctx->task;
1255 goto retry;
1256 }
1257
1258 /*
1259 * Since we have the lock this context can't be scheduled
1260 * in, so we can change the state safely.
1261 */
1262 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1263 update_group_times(event);
1264 event->state = PERF_EVENT_STATE_OFF;
1265 }
1266 raw_spin_unlock_irq(&ctx->lock);
1267}
1268
1269static void perf_set_shadow_time(struct perf_event *event,
1270 struct perf_event_context *ctx,
1271 u64 tstamp)
1272{
1273 /*
1274 * use the correct time source for the time snapshot
1275 *
1276 * We could get by without this by leveraging the
1277 * fact that to get to this function, the caller
1278 * has most likely already called update_context_time()
1279 * and update_cgrp_time_xx() and thus both timestamps
1280 * are identical (or very close). Given that tstamp is
1281 * already adjusted for cgroup, we could say that:
1282 * tstamp - ctx->timestamp
1283 * is equivalent to
1284 * tstamp - cgrp->timestamp.
1285 *
1286 * Then, in perf_output_read(), the calculation would
1287 * work with no changes because:
1288 * - event is guaranteed scheduled in
1289 * - no scheduled out in between
1290 * - thus the timestamp would be the same
1291 *
1292 * But this is a bit hairy.
1293 *
1294 * So instead, we have an explicit cgroup call to remain
1295 * within the time source all along. We believe it
1296 * is cleaner and simpler to understand.
1297 */
1298 if (is_cgroup_event(event))
1299 perf_cgroup_set_shadow_time(event, tstamp);
1300 else
1301 event->shadow_ctx_time = tstamp - ctx->timestamp;
1302}
1303
1304#define MAX_INTERRUPTS (~0ULL)
1305
1306static void perf_log_throttle(struct perf_event *event, int enable);
1307
1308static int
1309event_sched_in(struct perf_event *event,
1310 struct perf_cpu_context *cpuctx,
1311 struct perf_event_context *ctx)
1312{
1313 u64 tstamp = perf_event_time(event);
1314
1315 if (event->state <= PERF_EVENT_STATE_OFF)
1316 return 0;
1317
1318 event->state = PERF_EVENT_STATE_ACTIVE;
1319 event->oncpu = smp_processor_id();
1320
1321 /*
1322 * Unthrottle events: since we were just scheduled in, we might have missed
1323 * several ticks already; also, for a heavily scheduling task there is little
1324 * guarantee it'll get a tick in a timely manner.
1325 */
1326 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1327 perf_log_throttle(event, 1);
1328 event->hw.interrupts = 0;
1329 }
1330
1331 /*
1332 * The new state must be visible before we turn it on in the hardware:
1333 */
1334 smp_wmb();
1335
1336 if (event->pmu->add(event, PERF_EF_START)) {
1337 event->state = PERF_EVENT_STATE_INACTIVE;
1338 event->oncpu = -1;
1339 return -EAGAIN;
1340 }
1341
1342 event->tstamp_running += tstamp - event->tstamp_stopped;
1343
1344 perf_set_shadow_time(event, ctx, tstamp);
1345
1346 if (!is_software_event(event))
1347 cpuctx->active_oncpu++;
1348 ctx->nr_active++;
1349
1350 if (event->attr.exclusive)
1351 cpuctx->exclusive = 1;
1352
1353 return 0;
1354}
1355
1356static int
1357group_sched_in(struct perf_event *group_event,
1358 struct perf_cpu_context *cpuctx,
1359 struct perf_event_context *ctx)
1360{
1361 struct perf_event *event, *partial_group = NULL;
1362 struct pmu *pmu = group_event->pmu;
1363 u64 now = ctx->time;
1364 bool simulate = false;
1365
1366 if (group_event->state == PERF_EVENT_STATE_OFF)
1367 return 0;
1368
1369 pmu->start_txn(pmu);
1370
1371 if (event_sched_in(group_event, cpuctx, ctx)) {
1372 pmu->cancel_txn(pmu);
1373 return -EAGAIN;
1374 }
1375
1376 /*
1377 * Schedule in siblings as one group (if any):
1378 */
1379 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1380 if (event_sched_in(event, cpuctx, ctx)) {
1381 partial_group = event;
1382 goto group_error;
1383 }
1384 }
1385
1386 if (!pmu->commit_txn(pmu))
1387 return 0;
1388
1389group_error:
1390 /*
1391 * Groups can be scheduled in as one unit only, so undo any
1392 * partial group before returning:
1393 * The events up to the failed event are scheduled out normally,
1394 * tstamp_stopped will be updated.
1395 *
1396 * The failed events and the remaining siblings need to have
1397 * their timings updated as if they had gone through event_sched_in()
1398 * and event_sched_out(). This is required to get consistent timings
1399 * across the group. This also takes care of the case where the group
1400 * could never be scheduled by ensuring tstamp_stopped is set to mark
1401 * the time the event was actually stopped, such that time delta
1402 * calculation in update_event_times() is correct.
1403 */
1404 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1405 if (event == partial_group)
1406 simulate = true;
1407
1408 if (simulate) {
1409 event->tstamp_running += now - event->tstamp_stopped;
1410 event->tstamp_stopped = now;
1411 } else {
1412 event_sched_out(event, cpuctx, ctx);
1413 }
1414 }
1415 event_sched_out(group_event, cpuctx, ctx);
1416
1417 pmu->cancel_txn(pmu);
1418
1419 return -EAGAIN;
1420}
1421
1422/*
1423 * Work out whether we can put this event group on the CPU now.
1424 */
1425static int group_can_go_on(struct perf_event *event,
1426 struct perf_cpu_context *cpuctx,
1427 int can_add_hw)
1428{
1429 /*
1430 * Groups consisting entirely of software events can always go on.
1431 */
1432 if (event->group_flags & PERF_GROUP_SOFTWARE)
1433 return 1;
1434 /*
1435 * If an exclusive group is already on, no other hardware
1436 * events can go on.
1437 */
1438 if (cpuctx->exclusive)
1439 return 0;
1440 /*
1441 * If this group is exclusive and there are already
1442 * events on the CPU, it can't go on.
1443 */
1444 if (event->attr.exclusive && cpuctx->active_oncpu)
1445 return 0;
1446 /*
1447 * Otherwise, try to add it if all previous groups were able
1448 * to go on.
1449 */
1450 return can_add_hw;
1451}
1452
1453static void add_event_to_ctx(struct perf_event *event,
1454 struct perf_event_context *ctx)
1455{
1456 u64 tstamp = perf_event_time(event);
1457
1458 list_add_event(event, ctx);
1459 perf_group_attach(event);
1460 event->tstamp_enabled = tstamp;
1461 event->tstamp_running = tstamp;
1462 event->tstamp_stopped = tstamp;
1463}
1464
1465static void perf_event_context_sched_in(struct perf_event_context *ctx,
1466 struct task_struct *tsk);
1467
1468/*
1469 * Cross CPU call to install and enable a performance event
1470 *
1471 * Must be called with ctx->mutex held
1472 */
1473static int __perf_install_in_context(void *info)
1474{
1475 struct perf_event *event = info;
1476 struct perf_event_context *ctx = event->ctx;
1477 struct perf_event *leader = event->group_leader;
1478 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1479 int err;
1480
1481 /*
1482 * In case we're installing a new context to an already running task,
1483 * this could also happen before perf_event_task_sched_in() on architectures
1484 * which do context switches with IRQs enabled.
1485 */
1486 if (ctx->task && !cpuctx->task_ctx)
1487 perf_event_context_sched_in(ctx, ctx->task);
1488
1489 raw_spin_lock(&ctx->lock);
1490 ctx->is_active = 1;
1491 update_context_time(ctx);
1492 /*
1493 * update cgrp time only if current cgrp
1494 * matches event->cgrp. Must be done before
1495 * calling add_event_to_ctx()
1496 */
1497 update_cgrp_time_from_event(event);
1498
1499 add_event_to_ctx(event, ctx);
1500
1501 if (!event_filter_match(event))
1502 goto unlock;
1503
1504 /*
1505 * Don't put the event on if it is disabled or if
1506 * it is in a group and the group isn't on.
1507 */
1508 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1509 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1510 goto unlock;
1511
1512 /*
1513 * An exclusive event can't go on if there are already active
1514 * hardware events, and no hardware event can go on if there
1515 * is already an exclusive event on.
1516 */
1517 if (!group_can_go_on(event, cpuctx, 1))
1518 err = -EEXIST;
1519 else
1520 err = event_sched_in(event, cpuctx, ctx);
1521
1522 if (err) {
1523 /*
1524 * This event couldn't go on. If it is in a group
1525 * then we have to pull the whole group off.
1526 * If the event group is pinned then put it in error state.
1527 */
1528 if (leader != event)
1529 group_sched_out(leader, cpuctx, ctx);
1530 if (leader->attr.pinned) {
1531 update_group_times(leader);
1532 leader->state = PERF_EVENT_STATE_ERROR;
1533 }
1534 }
1535
1536unlock:
1537 raw_spin_unlock(&ctx->lock);
1538
1539 return 0;
1540}
1541
1542/*
1543 * Attach a performance event to a context
1544 *
1545 * First we add the event to the list with the hardware enable bit
1546 * in event->hw_config cleared.
1547 *
1548 * If the event is attached to a task which is on a CPU, we use an smp
1549 * call to enable it in the task context. The task might have been
1550 * scheduled away, but we check this in the smp call again.
1551 */
1552static void
1553perf_install_in_context(struct perf_event_context *ctx,
1554 struct perf_event *event,
1555 int cpu)
1556{
1557 struct task_struct *task = ctx->task;
1558
1559 lockdep_assert_held(&ctx->mutex);
1560
1561 event->ctx = ctx;
1562
1563 if (!task) {
1564 /*
1565 * Per cpu events are installed via an smp call and
1566 * the install is always successful.
1567 */
1568 cpu_function_call(cpu, __perf_install_in_context, event);
1569 return;
1570 }
1571
1572retry:
1573 if (!task_function_call(task, __perf_install_in_context, event))
1574 return;
1575
1576 raw_spin_lock_irq(&ctx->lock);
1577 /*
1578 * If we failed to find a running task, but find the context active now
1579 * that we've acquired the ctx->lock, retry.
1580 */
1581 if (ctx->is_active) {
1582 raw_spin_unlock_irq(&ctx->lock);
1583 goto retry;
1584 }
1585
1586 /*
1587 * Since the task isn't running, it's safe to add the event, us holding
1588 * the ctx->lock ensures the task won't get scheduled in.
1589 */
1590 add_event_to_ctx(event, ctx);
1591 raw_spin_unlock_irq(&ctx->lock);
1592}
1593
1594/*
1595 * Put an event into inactive state and update time fields.
1596 * Enabling the leader of a group effectively enables all
1597 * the group members that aren't explicitly disabled, so we
1598 * have to update their ->tstamp_enabled also.
1599 * Note: this works for group members as well as group leaders
1600 * since the non-leader members' sibling_lists will be empty.
1601 */
1602static void __perf_event_mark_enabled(struct perf_event *event,
1603 struct perf_event_context *ctx)
1604{
1605 struct perf_event *sub;
1606 u64 tstamp = perf_event_time(event);
1607
1608 event->state = PERF_EVENT_STATE_INACTIVE;
1609 event->tstamp_enabled = tstamp - event->total_time_enabled;
1610 list_for_each_entry(sub, &event->sibling_list, group_entry) {
1611 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1612 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1613 }
1614}
1615
1616/*
1617 * Cross CPU call to enable a performance event
1618 */
1619static int __perf_event_enable(void *info)
1620{
1621 struct perf_event *event = info;
1622 struct perf_event_context *ctx = event->ctx;
1623 struct perf_event *leader = event->group_leader;
1624 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1625 int err;
1626
1627 if (WARN_ON_ONCE(!ctx->is_active))
1628 return -EINVAL;
1629
1630 raw_spin_lock(&ctx->lock);
1631 update_context_time(ctx);
1632
1633 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1634 goto unlock;
1635
1636 /*
1637 * set current task's cgroup time reference point
1638 */
1639 perf_cgroup_set_timestamp(current, ctx);
1640
1641 __perf_event_mark_enabled(event, ctx);
1642
1643 if (!event_filter_match(event)) {
1644 if (is_cgroup_event(event))
1645 perf_cgroup_defer_enabled(event);
1646 goto unlock;
1647 }
1648
1649 /*
1650 * If the event is in a group and isn't the group leader,
1651 * then don't put it on unless the group is on.
1652 */
1653 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1654 goto unlock;
1655
1656 if (!group_can_go_on(event, cpuctx, 1)) {
1657 err = -EEXIST;
1658 } else {
1659 if (event == leader)
1660 err = group_sched_in(event, cpuctx, ctx);
1661 else
1662 err = event_sched_in(event, cpuctx, ctx);
1663 }
1664
1665 if (err) {
1666 /*
1667 * If this event can't go on and it's part of a
1668 * group, then the whole group has to come off.
1669 */
1670 if (leader != event)
1671 group_sched_out(leader, cpuctx, ctx);
1672 if (leader->attr.pinned) {
1673 update_group_times(leader);
1674 leader->state = PERF_EVENT_STATE_ERROR;
1675 }
1676 }
1677
1678unlock:
1679 raw_spin_unlock(&ctx->lock);
1680
1681 return 0;
1682}
1683
1684/*
1685 * Enable an event.
1686 *
1687 * If event->ctx is a cloned context, callers must make sure that
1688 * every task struct that event->ctx->task could possibly point to
1689 * remains valid. This condition is satisfied when called through
1690 * perf_event_for_each_child or perf_event_for_each as described
1691 * for perf_event_disable.
1692 */
1693void perf_event_enable(struct perf_event *event)
1694{
1695 struct perf_event_context *ctx = event->ctx;
1696 struct task_struct *task = ctx->task;
1697
1698 if (!task) {
1699 /*
1700 * Enable the event on the cpu that it's on
1701 */
1702 cpu_function_call(event->cpu, __perf_event_enable, event);
1703 return;
1704 }
1705
1706 raw_spin_lock_irq(&ctx->lock);
1707 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1708 goto out;
1709
1710 /*
1711 * If the event is in error state, clear that first.
1712 * That way, if we see the event in error state below, we
1713 * know that it has gone back into error state, as distinct
1714 * from the task having been scheduled away before the
1715 * cross-call arrived.
1716 */
1717 if (event->state == PERF_EVENT_STATE_ERROR)
1718 event->state = PERF_EVENT_STATE_OFF;
1719
1720retry:
1721 if (!ctx->is_active) {
1722 __perf_event_mark_enabled(event, ctx);
1723 goto out;
1724 }
1725
1726 raw_spin_unlock_irq(&ctx->lock);
1727
1728 if (!task_function_call(task, __perf_event_enable, event))
1729 return;
1730
1731 raw_spin_lock_irq(&ctx->lock);
1732
1733 /*
1734 * If the context is active and the event is still off,
1735 * we need to retry the cross-call.
1736 */
1737 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1738 /*
1739 * task could have been flipped by a concurrent
1740 * perf_event_context_sched_out()
1741 */
1742 task = ctx->task;
1743 goto retry;
1744 }
1745
1746out:
1747 raw_spin_unlock_irq(&ctx->lock);
1748}
1749
1750static int perf_event_refresh(struct perf_event *event, int refresh)
1751{
1752 /*
1753 * not supported on inherited events
1754 */
1755 if (event->attr.inherit || !is_sampling_event(event))
1756 return -EINVAL;
1757
1758 atomic_add(refresh, &event->event_limit);
1759 perf_event_enable(event);
1760
1761 return 0;
1762}
1763
1764static void ctx_sched_out(struct perf_event_context *ctx,
1765 struct perf_cpu_context *cpuctx,
1766 enum event_type_t event_type)
1767{
1768 struct perf_event *event;
1769
1770 raw_spin_lock(&ctx->lock);
1771 perf_pmu_disable(ctx->pmu);
1772 ctx->is_active = 0;
1773 if (likely(!ctx->nr_events))
1774 goto out;
1775 update_context_time(ctx);
1776 update_cgrp_time_from_cpuctx(cpuctx);
1777
1778 if (!ctx->nr_active)
1779 goto out;
1780
1781 if (event_type & EVENT_PINNED) {
1782 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1783 group_sched_out(event, cpuctx, ctx);
1784 }
1785
1786 if (event_type & EVENT_FLEXIBLE) {
1787 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1788 group_sched_out(event, cpuctx, ctx);
1789 }
1790out:
1791 perf_pmu_enable(ctx->pmu);
1792 raw_spin_unlock(&ctx->lock);
1793}
1794
1795/*
1796 * Test whether two contexts are equivalent, i.e. whether they
1797 * have both been cloned from the same version of the same context
1798 * and they both have the same number of enabled events.
1799 * If the number of enabled events is the same, then the set
1800 * of enabled events should be the same, because these are both
1801 * inherited contexts, therefore we can't access individual events
1802 * in them directly with an fd; we can only enable/disable all
1803 * events via prctl, or enable/disable all events in a family
1804 * via ioctl, which will have the same effect on both contexts.
1805 */
1806static int context_equiv(struct perf_event_context *ctx1,
1807 struct perf_event_context *ctx2)
1808{
1809 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1810 && ctx1->parent_gen == ctx2->parent_gen
1811 && !ctx1->pin_count && !ctx2->pin_count;
1812}
1813
1814static void __perf_event_sync_stat(struct perf_event *event,
1815 struct perf_event *next_event)
1816{
1817 u64 value;
1818
1819 if (!event->attr.inherit_stat)
1820 return;
1821
1822 /*
1823 * Update the event value, we cannot use perf_event_read()
1824 * because we're in the middle of a context switch and have IRQs
1825 * disabled, which upsets smp_call_function_single(), however
1826 * we know the event must be on the current CPU, therefore we
1827 * don't need to use it.
1828 */
1829 switch (event->state) {
1830 case PERF_EVENT_STATE_ACTIVE:
1831 event->pmu->read(event);
1832 /* fall-through */
1833
1834 case PERF_EVENT_STATE_INACTIVE:
1835 update_event_times(event);
1836 break;
1837
1838 default:
1839 break;
1840 }
1841
1842 /*
1843 * In order to keep per-task stats reliable we need to flip the event
1844 * values when we flip the contexts.
1845 */
1846 value = local64_read(&next_event->count);
1847 value = local64_xchg(&event->count, value);
1848 local64_set(&next_event->count, value);
1849
1850 swap(event->total_time_enabled, next_event->total_time_enabled);
1851 swap(event->total_time_running, next_event->total_time_running);
1852
1853 /*
1854 * Since we swizzled the values, update the user visible data too.
1855 */
1856 perf_event_update_userpage(event);
1857 perf_event_update_userpage(next_event);
1858}
1859
1860#define list_next_entry(pos, member) \
1861 list_entry(pos->member.next, typeof(*pos), member)
1862
1863static void perf_event_sync_stat(struct perf_event_context *ctx,
1864 struct perf_event_context *next_ctx)
1865{
1866 struct perf_event *event, *next_event;
1867
1868 if (!ctx->nr_stat)
1869 return;
1870
1871 update_context_time(ctx);
1872
1873 event = list_first_entry(&ctx->event_list,
1874 struct perf_event, event_entry);
1875
1876 next_event = list_first_entry(&next_ctx->event_list,
1877 struct perf_event, event_entry);
1878
1879 while (&event->event_entry != &ctx->event_list &&
1880 &next_event->event_entry != &next_ctx->event_list) {
1881
1882 __perf_event_sync_stat(event, next_event);
1883
1884 event = list_next_entry(event, event_entry);
1885 next_event = list_next_entry(next_event, event_entry);
1886 }
1887}
1888
1889static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1890 struct task_struct *next)
1891{
1892 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1893 struct perf_event_context *next_ctx;
1894 struct perf_event_context *parent;
1895 struct perf_cpu_context *cpuctx;
1896 int do_switch = 1;
1897
1898 if (likely(!ctx))
1899 return;
1900
1901 cpuctx = __get_cpu_context(ctx);
1902 if (!cpuctx->task_ctx)
1903 return;
1904
1905 rcu_read_lock();
1906 parent = rcu_dereference(ctx->parent_ctx);
1907 next_ctx = next->perf_event_ctxp[ctxn];
1908 if (parent && next_ctx &&
1909 rcu_dereference(next_ctx->parent_ctx) == parent) {
1910 /*
1911 * Looks like the two contexts are clones, so we might be
1912 * able to optimize the context switch. We lock both
1913 * contexts and check that they are clones under the
1914 * lock (including re-checking that neither has been
1915 * uncloned in the meantime). It doesn't matter which
1916 * order we take the locks because no other cpu could
1917 * be trying to lock both of these tasks.
1918 */
1919 raw_spin_lock(&ctx->lock);
1920 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1921 if (context_equiv(ctx, next_ctx)) {
1922 /*
1923 * XXX do we need a memory barrier of sorts
1924 * wrt to rcu_dereference() of perf_event_ctxp
1925 */
1926 task->perf_event_ctxp[ctxn] = next_ctx;
1927 next->perf_event_ctxp[ctxn] = ctx;
1928 ctx->task = next;
1929 next_ctx->task = task;
1930 do_switch = 0;
1931
1932 perf_event_sync_stat(ctx, next_ctx);
1933 }
1934 raw_spin_unlock(&next_ctx->lock);
1935 raw_spin_unlock(&ctx->lock);
1936 }
1937 rcu_read_unlock();
1938
1939 if (do_switch) {
1940 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1941 cpuctx->task_ctx = NULL;
1942 }
1943}
1944
1945#define for_each_task_context_nr(ctxn) \
1946 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1947
1948/*
1949 * Called from scheduler to remove the events of the current task,
1950 * with interrupts disabled.
1951 *
1952 * We stop each event and update the event value in event->count.
1953 *
1954 * This does not protect us against NMI, but disable()
1955 * sets the disabled bit in the control field of event _before_
1956 * accessing the event control register. If an NMI hits, then it will
1957 * not restart the event.
1958 */
1959void __perf_event_task_sched_out(struct task_struct *task,
1960 struct task_struct *next)
1961{
1962 int ctxn;
1963
1964 for_each_task_context_nr(ctxn)
1965 perf_event_context_sched_out(task, ctxn, next);
1966
1967 /*
1968 * if cgroup events exist on this CPU, then we need
1969 * to check if we have to switch out PMU state.
1970 * cgroup events are system-wide mode only
1971 */
1972 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1973 perf_cgroup_sched_out(task);
1974}
1975
1976static void task_ctx_sched_out(struct perf_event_context *ctx,
1977 enum event_type_t event_type)
1978{
1979 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1980
1981 if (!cpuctx->task_ctx)
1982 return;
1983
1984 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1985 return;
1986
1987 ctx_sched_out(ctx, cpuctx, event_type);
1988 cpuctx->task_ctx = NULL;
1989}
1990
1991/*
1992 * Called with IRQs disabled
1993 */
1994static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1995 enum event_type_t event_type)
1996{
1997 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1998}
1999
2000static void
2001ctx_pinned_sched_in(struct perf_event_context *ctx,
2002 struct perf_cpu_context *cpuctx)
2003{
2004 struct perf_event *event;
2005
2006 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2007 if (event->state <= PERF_EVENT_STATE_OFF)
2008 continue;
2009 if (!event_filter_match(event))
2010 continue;
2011
2012 /* may need to reset tstamp_enabled */
2013 if (is_cgroup_event(event))
2014 perf_cgroup_mark_enabled(event, ctx);
2015
2016 if (group_can_go_on(event, cpuctx, 1))
2017 group_sched_in(event, cpuctx, ctx);
2018
2019 /*
2020 * If this pinned group hasn't been scheduled,
2021 * put it in error state.
2022 */
2023 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2024 update_group_times(event);
2025 event->state = PERF_EVENT_STATE_ERROR;
2026 }
2027 }
2028}
2029
2030static void
2031ctx_flexible_sched_in(struct perf_event_context *ctx,
2032 struct perf_cpu_context *cpuctx)
2033{
2034 struct perf_event *event;
2035 int can_add_hw = 1;
2036
2037 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2038 /* Ignore events in OFF or ERROR state */
2039 if (event->state <= PERF_EVENT_STATE_OFF)
2040 continue;
2041 /*
2042 * Listen to the 'cpu' scheduling filter constraint
2043 * of events:
2044 */
2045 if (!event_filter_match(event))
2046 continue;
2047
2048 /* may need to reset tstamp_enabled */
2049 if (is_cgroup_event(event))
2050 perf_cgroup_mark_enabled(event, ctx);
2051
2052 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2053 if (group_sched_in(event, cpuctx, ctx))
2054 can_add_hw = 0;
2055 }
2056 }
2057}
2058
2059static void
2060ctx_sched_in(struct perf_event_context *ctx,
2061 struct perf_cpu_context *cpuctx,
2062 enum event_type_t event_type,
2063 struct task_struct *task)
2064{
2065 u64 now;
2066
2067 raw_spin_lock(&ctx->lock);
2068 ctx->is_active = 1;
2069 if (likely(!ctx->nr_events))
2070 goto out;
2071
2072 now = perf_clock();
2073 ctx->timestamp = now;
2074 perf_cgroup_set_timestamp(task, ctx);
2075 /*
2076 * First go through the list and put on any pinned groups
2077 * in order to give them the best chance of going on.
2078 */
2079 if (event_type & EVENT_PINNED)
2080 ctx_pinned_sched_in(ctx, cpuctx);
2081
2082 /* Then walk through the lower prio flexible groups */
2083 if (event_type & EVENT_FLEXIBLE)
2084 ctx_flexible_sched_in(ctx, cpuctx);
2085
2086out:
2087 raw_spin_unlock(&ctx->lock);
2088}
2089
2090static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2091 enum event_type_t event_type,
2092 struct task_struct *task)
2093{
2094 struct perf_event_context *ctx = &cpuctx->ctx;
2095
2096 ctx_sched_in(ctx, cpuctx, event_type, task);
2097}
2098
2099static void task_ctx_sched_in(struct perf_event_context *ctx,
2100 enum event_type_t event_type)
2101{
2102 struct perf_cpu_context *cpuctx;
2103
2104 cpuctx = __get_cpu_context(ctx);
2105 if (cpuctx->task_ctx == ctx)
2106 return;
2107
2108 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2109 cpuctx->task_ctx = ctx;
2110}
2111
2112static void perf_event_context_sched_in(struct perf_event_context *ctx,
2113 struct task_struct *task)
2114{
2115 struct perf_cpu_context *cpuctx;
2116
2117 cpuctx = __get_cpu_context(ctx);
2118 if (cpuctx->task_ctx == ctx)
2119 return;
2120
2121 perf_pmu_disable(ctx->pmu);
2122 /*
2123 * We want to keep the following priority order:
2124 * cpu pinned (that don't need to move), task pinned,
2125 * cpu flexible, task flexible.
2126 */
2127 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2128
2129 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2130 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2131 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2132
2133 cpuctx->task_ctx = ctx;
2134
2135 /*
2136 * Since these rotations are per-cpu, we need to ensure the
2137 * cpu-context we got scheduled on is actually rotating.
2138 */
2139 perf_pmu_rotate_start(ctx->pmu);
2140 perf_pmu_enable(ctx->pmu);
2141}
2142
2143/*
2144 * Called from scheduler to add the events of the current task
2145 * with interrupts disabled.
2146 *
2147 * We restore the event value and then enable it.
2148 *
2149 * This does not protect us against NMI, but enable()
2150 * sets the enabled bit in the control field of event _before_
2151 * accessing the event control register. If an NMI hits, then it will
2152 * keep the event running.
2153 */
2154void __perf_event_task_sched_in(struct task_struct *task)
2155{
2156 struct perf_event_context *ctx;
2157 int ctxn;
2158
2159 for_each_task_context_nr(ctxn) {
2160 ctx = task->perf_event_ctxp[ctxn];
2161 if (likely(!ctx))
2162 continue;
2163
2164 perf_event_context_sched_in(ctx, task);
2165 }
2166 /*
2167 * if cgroup events exist on this CPU, then we need
2168 * to check if we have to switch in PMU state.
2169 * cgroup event are system-wide mode only
2170	 * cgroup events are in system-wide mode only
2171 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2172 perf_cgroup_sched_in(task);
2173}
2174
2175static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2176{
2177 u64 frequency = event->attr.sample_freq;
2178 u64 sec = NSEC_PER_SEC;
2179 u64 divisor, dividend;
2180
2181 int count_fls, nsec_fls, frequency_fls, sec_fls;
2182
2183 count_fls = fls64(count);
2184 nsec_fls = fls64(nsec);
2185 frequency_fls = fls64(frequency);
2186 sec_fls = 30;
2187
2188 /*
2189 * We got @count in @nsec, with a target of sample_freq HZ
2190 * the target period becomes:
2191 *
2192 * @count * 10^9
2193 * period = -------------------
2194 * @nsec * sample_freq
2195 *
2196 */
2197
2198 /*
2199 * Reduce accuracy by one bit such that @a and @b converge
2200 * to a similar magnitude.
2201 */
2202#define REDUCE_FLS(a, b) \
2203do { \
2204 if (a##_fls > b##_fls) { \
2205 a >>= 1; \
2206 a##_fls--; \
2207 } else { \
2208 b >>= 1; \
2209 b##_fls--; \
2210 } \
2211} while (0)
2212
2213 /*
2214 * Reduce accuracy until either term fits in a u64, then proceed with
2215 * the other, so that finally we can do a u64/u64 division.
2216 */
2217 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2218 REDUCE_FLS(nsec, frequency);
2219 REDUCE_FLS(sec, count);
2220 }
2221
2222 if (count_fls + sec_fls > 64) {
2223 divisor = nsec * frequency;
2224
2225 while (count_fls + sec_fls > 64) {
2226 REDUCE_FLS(count, sec);
2227 divisor >>= 1;
2228 }
2229
2230 dividend = count * sec;
2231 } else {
2232 dividend = count * sec;
2233
2234 while (nsec_fls + frequency_fls > 64) {
2235 REDUCE_FLS(nsec, frequency);
2236 dividend >>= 1;
2237 }
2238
2239 divisor = nsec * frequency;
2240 }
2241
2242 if (!divisor)
2243 return dividend;
2244
2245 return div64_u64(dividend, divisor);
2246}
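
As a sanity check of the formula in the comment above, here is a hedged userspace-style sketch (not part of this file) that computes the same target period directly; it leans on the compiler's 128-bit integer extension instead of the REDUCE_FLS() bit-shaving the kernel has to do:

#include <stdio.h>
#include <stdint.h>

/* Illustration only: with count = 1,000,000 events observed in
 * nsec = 10,000,000 ns (10 ms) at sample_freq = 1000 Hz, the target
 * period is 1e6 * 1e9 / (1e7 * 1e3) = 100,000 events per sample.
 */
static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	unsigned __int128 dividend = (unsigned __int128)count * 1000000000ULL;
	unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

	return divisor ? (uint64_t)(dividend / divisor) : (uint64_t)dividend;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)calc_period(1000000, 10000000, 1000));
	return 0;
}
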
2247
2248static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
2249{
2250 struct hw_perf_event *hwc = &event->hw;
2251 s64 period, sample_period;
2252 s64 delta;
2253
2254 period = perf_calculate_period(event, nsec, count);
2255
2256 delta = (s64)(period - hwc->sample_period);
2257 delta = (delta + 7) / 8; /* low pass filter */
2258
2259 sample_period = hwc->sample_period + delta;
2260
2261 if (!sample_period)
2262 sample_period = 1;
2263
2264 hwc->sample_period = sample_period;
2265
2266 if (local64_read(&hwc->period_left) > 8*sample_period) {
2267 event->pmu->stop(event, PERF_EF_UPDATE);
2268 local64_set(&hwc->period_left, 0);
2269 event->pmu->start(event, PERF_EF_RELOAD);
2270 }
2271}
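
The divide-by-8 above is a simple first-order low-pass filter: each invocation only moves sample_period an eighth of the way towards the newly computed period. A hedged sketch of that step in isolation (names invented for the illustration):

/* Starting at sample_period = 10000 with a steady target of 20000,
 * successive steps give 10000 -> 11250 -> 12344 -> 13301 -> ...
 */
static long long filter_step(long long sample_period, long long target)
{
	long long delta = (target - sample_period + 7) / 8;

	return sample_period + delta;
}
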
2272
2273static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2274{
2275 struct perf_event *event;
2276 struct hw_perf_event *hwc;
2277 u64 interrupts, now;
2278 s64 delta;
2279
2280 raw_spin_lock(&ctx->lock);
2281 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2282 if (event->state != PERF_EVENT_STATE_ACTIVE)
2283 continue;
2284
2285 if (!event_filter_match(event))
2286 continue;
2287
2288 hwc = &event->hw;
2289
2290 interrupts = hwc->interrupts;
2291 hwc->interrupts = 0;
2292
2293 /*
2294 * unthrottle events on the tick
2295 */
2296 if (interrupts == MAX_INTERRUPTS) {
2297 perf_log_throttle(event, 1);
2298 event->pmu->start(event, 0);
2299 }
2300
2301 if (!event->attr.freq || !event->attr.sample_freq)
2302 continue;
2303
2304 event->pmu->read(event);
2305 now = local64_read(&event->count);
2306 delta = now - hwc->freq_count_stamp;
2307 hwc->freq_count_stamp = now;
2308
2309 if (delta > 0)
2310 perf_adjust_period(event, period, delta);
2311 }
2312 raw_spin_unlock(&ctx->lock);
2313}
2314
2315/*
2316 * Round-robin a context's events:
2317 */
2318static void rotate_ctx(struct perf_event_context *ctx)
2319{
2320 raw_spin_lock(&ctx->lock);
2321
2322 /*
2323	 * Rotate the first entry of the non-pinned groups to the tail. Rotation might be
2324 * disabled by the inheritance code.
2325 */
2326 if (!ctx->rotate_disable)
2327 list_rotate_left(&ctx->flexible_groups);
2328
2329 raw_spin_unlock(&ctx->lock);
2330}
2331
2332/*
2333 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2334 * because they're strictly cpu affine and rotate_start is called with IRQs
2335 * disabled, while rotate_context is called from IRQ context.
2336 */
2337static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2338{
2339 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2340 struct perf_event_context *ctx = NULL;
2341 int rotate = 0, remove = 1;
2342
2343 if (cpuctx->ctx.nr_events) {
2344 remove = 0;
2345 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2346 rotate = 1;
2347 }
2348
2349 ctx = cpuctx->task_ctx;
2350 if (ctx && ctx->nr_events) {
2351 remove = 0;
2352 if (ctx->nr_events != ctx->nr_active)
2353 rotate = 1;
2354 }
2355
2356 perf_pmu_disable(cpuctx->ctx.pmu);
2357 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2358 if (ctx)
2359 perf_ctx_adjust_freq(ctx, interval);
2360
2361 if (!rotate)
2362 goto done;
2363
2364 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2365 if (ctx)
2366 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
2367
2368 rotate_ctx(&cpuctx->ctx);
2369 if (ctx)
2370 rotate_ctx(ctx);
2371
2372 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
2373 if (ctx)
2374 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2375
2376done:
2377 if (remove)
2378 list_del_init(&cpuctx->rotation_list);
2379
2380 perf_pmu_enable(cpuctx->ctx.pmu);
2381}
2382
2383void perf_event_task_tick(void)
2384{
2385 struct list_head *head = &__get_cpu_var(rotation_list);
2386 struct perf_cpu_context *cpuctx, *tmp;
2387
2388 WARN_ON(!irqs_disabled());
2389
2390 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2391 if (cpuctx->jiffies_interval == 1 ||
2392 !(jiffies % cpuctx->jiffies_interval))
2393 perf_rotate_context(cpuctx);
2394 }
2395}
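
When more events are active than the PMU has counters, this tick-driven rotation is what makes an event's time_running fall behind its time_enabled; consumers then usually scale the raw count back up. A hedged userspace sketch of that common estimate (not part of this file):

#include <stdint.h>

/* Estimate what the count would have been had the event stayed on
 * the PMU for the whole time it was enabled.
 */
static uint64_t scale_count(uint64_t raw, uint64_t enabled, uint64_t running)
{
	if (!running)
		return 0;

	return (uint64_t)((double)raw * enabled / running);
}
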
2396
2397static int event_enable_on_exec(struct perf_event *event,
2398 struct perf_event_context *ctx)
2399{
2400 if (!event->attr.enable_on_exec)
2401 return 0;
2402
2403 event->attr.enable_on_exec = 0;
2404 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2405 return 0;
2406
2407 __perf_event_mark_enabled(event, ctx);
2408
2409 return 1;
2410}
2411
2412/*
2413 * Enable all of a task's events that have been marked enable-on-exec.
2414 * This expects task == current.
2415 */
2416static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2417{
2418 struct perf_event *event;
2419 unsigned long flags;
2420 int enabled = 0;
2421 int ret;
2422
2423 local_irq_save(flags);
2424 if (!ctx || !ctx->nr_events)
2425 goto out;
2426
2427 /*
2428	 * We must context-switch out cgroup events to avoid a conflict
2429	 * when invoking perf_event_context_sched_in() later on
2430	 * in this function. Otherwise we end up trying to
2431	 * switch in cgroup events which are already scheduled
2432	 * in.
2433 */
2434 perf_cgroup_sched_out(current);
2435 task_ctx_sched_out(ctx, EVENT_ALL);
2436
2437 raw_spin_lock(&ctx->lock);
2438
2439 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2440 ret = event_enable_on_exec(event, ctx);
2441 if (ret)
2442 enabled = 1;
2443 }
2444
2445 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2446 ret = event_enable_on_exec(event, ctx);
2447 if (ret)
2448 enabled = 1;
2449 }
2450
2451 /*
2452 * Unclone this context if we enabled any event.
2453 */
2454 if (enabled)
2455 unclone_ctx(ctx);
2456
2457 raw_spin_unlock(&ctx->lock);
2458
2459 /*
2460 * Also calls ctxswin for cgroup events, if any:
2461 */
2462 perf_event_context_sched_in(ctx, ctx->task);
2463out:
2464 local_irq_restore(flags);
2465}
2466
2467/*
2468 * Cross CPU call to read the hardware event
2469 */
2470static void __perf_event_read(void *info)
2471{
2472 struct perf_event *event = info;
2473 struct perf_event_context *ctx = event->ctx;
2474 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2475
2476 /*
2477 * If this is a task context, we need to check whether it is
2478 * the current task context of this cpu. If not, it has been
2479 * scheduled out before the smp call arrived. In that case
2480 * event->count would have been updated to a recent sample
2481 * when the event was scheduled out.
2482 */
2483 if (ctx->task && cpuctx->task_ctx != ctx)
2484 return;
2485
2486 raw_spin_lock(&ctx->lock);
2487 if (ctx->is_active) {
2488 update_context_time(ctx);
2489 update_cgrp_time_from_event(event);
2490 }
2491 update_event_times(event);
2492 if (event->state == PERF_EVENT_STATE_ACTIVE)
2493 event->pmu->read(event);
2494 raw_spin_unlock(&ctx->lock);
2495}
2496
2497static inline u64 perf_event_count(struct perf_event *event)
2498{
2499 return local64_read(&event->count) + atomic64_read(&event->child_count);
2500}
2501
2502static u64 perf_event_read(struct perf_event *event)
2503{
2504 /*
2505 * If event is enabled and currently active on a CPU, update the
2506 * value in the event structure:
2507 */
2508 if (event->state == PERF_EVENT_STATE_ACTIVE) {
2509 smp_call_function_single(event->oncpu,
2510 __perf_event_read, event, 1);
2511 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2512 struct perf_event_context *ctx = event->ctx;
2513 unsigned long flags;
2514
2515 raw_spin_lock_irqsave(&ctx->lock, flags);
2516 /*
2517 * may read while context is not active
2518 * (e.g., thread is blocked), in that case
2519 * we cannot update context time
2520 */
2521 if (ctx->is_active) {
2522 update_context_time(ctx);
2523 update_cgrp_time_from_event(event);
2524 }
2525 update_event_times(event);
2526 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2527 }
2528
2529 return perf_event_count(event);
2530}
2531
2532/*
2533 * Callchain support
2534 */
2535
2536struct callchain_cpus_entries {
2537 struct rcu_head rcu_head;
2538 struct perf_callchain_entry *cpu_entries[0];
2539};
2540
2541static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2542static atomic_t nr_callchain_events;
2543static DEFINE_MUTEX(callchain_mutex);
2544struct callchain_cpus_entries *callchain_cpus_entries;
2545
2546
2547__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2548 struct pt_regs *regs)
2549{
2550}
2551
2552__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2553 struct pt_regs *regs)
2554{
2555}
2556
2557static void release_callchain_buffers_rcu(struct rcu_head *head)
2558{
2559 struct callchain_cpus_entries *entries;
2560 int cpu;
2561
2562 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2563
2564 for_each_possible_cpu(cpu)
2565 kfree(entries->cpu_entries[cpu]);
2566
2567 kfree(entries);
2568}
2569
2570static void release_callchain_buffers(void)
2571{
2572 struct callchain_cpus_entries *entries;
2573
2574 entries = callchain_cpus_entries;
2575 rcu_assign_pointer(callchain_cpus_entries, NULL);
2576 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2577}
2578
2579static int alloc_callchain_buffers(void)
2580{
2581 int cpu;
2582 int size;
2583 struct callchain_cpus_entries *entries;
2584
2585 /*
2586 * We can't use the percpu allocation API for data that can be
2587 * accessed from NMI. Use a temporary manual per cpu allocation
2588 * until that gets sorted out.
2589 */
2590 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2591
2592 entries = kzalloc(size, GFP_KERNEL);
2593 if (!entries)
2594 return -ENOMEM;
2595
2596 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2597
2598 for_each_possible_cpu(cpu) {
2599 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2600 cpu_to_node(cpu));
2601 if (!entries->cpu_entries[cpu])
2602 goto fail;
2603 }
2604
2605 rcu_assign_pointer(callchain_cpus_entries, entries);
2606
2607 return 0;
2608
2609fail:
2610 for_each_possible_cpu(cpu)
2611 kfree(entries->cpu_entries[cpu]);
2612 kfree(entries);
2613
2614 return -ENOMEM;
2615}
2616
2617static int get_callchain_buffers(void)
2618{
2619 int err = 0;
2620 int count;
2621
2622 mutex_lock(&callchain_mutex);
2623
2624 count = atomic_inc_return(&nr_callchain_events);
2625 if (WARN_ON_ONCE(count < 1)) {
2626 err = -EINVAL;
2627 goto exit;
2628 }
2629
2630 if (count > 1) {
2631 /* If the allocation failed, give up */
2632 if (!callchain_cpus_entries)
2633 err = -ENOMEM;
2634 goto exit;
2635 }
2636
2637 err = alloc_callchain_buffers();
2638 if (err)
2639 release_callchain_buffers();
2640exit:
2641 mutex_unlock(&callchain_mutex);
2642
2643 return err;
2644}
2645
2646static void put_callchain_buffers(void)
2647{
2648 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2649 release_callchain_buffers();
2650 mutex_unlock(&callchain_mutex);
2651 }
2652}
2653
2654static int get_recursion_context(int *recursion)
2655{
2656 int rctx;
2657
2658 if (in_nmi())
2659 rctx = 3;
2660 else if (in_irq())
2661 rctx = 2;
2662 else if (in_softirq())
2663 rctx = 1;
2664 else
2665 rctx = 0;
2666
2667 if (recursion[rctx])
2668 return -1;
2669
2670 recursion[rctx]++;
2671 barrier();
2672
2673 return rctx;
2674}
2675
2676static inline void put_recursion_context(int *recursion, int rctx)
2677{
2678 barrier();
2679 recursion[rctx]--;
2680}
2681
2682static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2683{
2684 int cpu;
2685 struct callchain_cpus_entries *entries;
2686
2687 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2688 if (*rctx == -1)
2689 return NULL;
2690
2691 entries = rcu_dereference(callchain_cpus_entries);
2692 if (!entries)
2693 return NULL;
2694
2695 cpu = smp_processor_id();
2696
2697 return &entries->cpu_entries[cpu][*rctx];
2698}
2699
2700static void
2701put_callchain_entry(int rctx)
2702{
2703 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2704}
2705
2706static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2707{
2708 int rctx;
2709 struct perf_callchain_entry *entry;
2710
2711
2712 entry = get_callchain_entry(&rctx);
2713 if (rctx == -1)
2714 return NULL;
2715
2716 if (!entry)
2717 goto exit_put;
2718
2719 entry->nr = 0;
2720
2721 if (!user_mode(regs)) {
2722 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2723 perf_callchain_kernel(entry, regs);
2724 if (current->mm)
2725 regs = task_pt_regs(current);
2726 else
2727 regs = NULL;
2728 }
2729
2730 if (regs) {
2731 perf_callchain_store(entry, PERF_CONTEXT_USER);
2732 perf_callchain_user(entry, regs);
2733 }
2734
2735exit_put:
2736 put_callchain_entry(rctx);
2737
2738 return entry;
2739}
2740
2741/*
2742 * Initialize the perf_event context in a task_struct:
2743 */
2744static void __perf_event_init_context(struct perf_event_context *ctx)
2745{
2746 raw_spin_lock_init(&ctx->lock);
2747 mutex_init(&ctx->mutex);
2748 INIT_LIST_HEAD(&ctx->pinned_groups);
2749 INIT_LIST_HEAD(&ctx->flexible_groups);
2750 INIT_LIST_HEAD(&ctx->event_list);
2751 atomic_set(&ctx->refcount, 1);
2752}
2753
2754static struct perf_event_context *
2755alloc_perf_context(struct pmu *pmu, struct task_struct *task)
2756{
2757 struct perf_event_context *ctx;
2758
2759 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2760 if (!ctx)
2761 return NULL;
2762
2763 __perf_event_init_context(ctx);
2764 if (task) {
2765 ctx->task = task;
2766 get_task_struct(task);
2767 }
2768 ctx->pmu = pmu;
2769
2770 return ctx;
2771}
2772
2773static struct task_struct *
2774find_lively_task_by_vpid(pid_t vpid)
2775{
2776 struct task_struct *task;
2777 int err;
2778
2779 rcu_read_lock();
2780 if (!vpid)
2781 task = current;
2782 else
2783 task = find_task_by_vpid(vpid);
2784 if (task)
2785 get_task_struct(task);
2786 rcu_read_unlock();
2787
2788 if (!task)
2789 return ERR_PTR(-ESRCH);
2790
2791 /* Reuse ptrace permission checks for now. */
2792 err = -EACCES;
2793 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2794 goto errout;
2795
2796 return task;
2797errout:
2798 put_task_struct(task);
2799 return ERR_PTR(err);
2800
2801}
2802
2803/*
2804 * Returns a matching context with refcount and pincount.
2805 */
2806static struct perf_event_context *
2807find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2808{
2809 struct perf_event_context *ctx;
2810 struct perf_cpu_context *cpuctx;
2811 unsigned long flags;
2812 int ctxn, err;
2813
2814 if (!task) {
2815 /* Must be root to operate on a CPU event: */
2816 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2817 return ERR_PTR(-EACCES);
2818
2819 /*
2820		 * We could be clever and allow attaching an event to an
2821 * offline CPU and activate it when the CPU comes up, but
2822 * that's for later.
2823 */
2824 if (!cpu_online(cpu))
2825 return ERR_PTR(-ENODEV);
2826
2827 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2828 ctx = &cpuctx->ctx;
2829 get_ctx(ctx);
2830 ++ctx->pin_count;
2831
2832 return ctx;
2833 }
2834
2835 err = -EINVAL;
2836 ctxn = pmu->task_ctx_nr;
2837 if (ctxn < 0)
2838 goto errout;
2839
2840retry:
2841 ctx = perf_lock_task_context(task, ctxn, &flags);
2842 if (ctx) {
2843 unclone_ctx(ctx);
2844 ++ctx->pin_count;
2845 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2846 }
2847
2848 if (!ctx) {
2849 ctx = alloc_perf_context(pmu, task);
2850 err = -ENOMEM;
2851 if (!ctx)
2852 goto errout;
2853
2854 get_ctx(ctx);
2855
2856 err = 0;
2857 mutex_lock(&task->perf_event_mutex);
2858 /*
2859		 * If it has already passed perf_event_exit_task(),
2860		 * we must see PF_EXITING; it takes this mutex too.
2861 */
2862 if (task->flags & PF_EXITING)
2863 err = -ESRCH;
2864 else if (task->perf_event_ctxp[ctxn])
2865 err = -EAGAIN;
2866 else {
2867 ++ctx->pin_count;
2868 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2869 }
2870 mutex_unlock(&task->perf_event_mutex);
2871
2872 if (unlikely(err)) {
2873 put_task_struct(task);
2874 kfree(ctx);
2875
2876 if (err == -EAGAIN)
2877 goto retry;
2878 goto errout;
2879 }
2880 }
2881
2882 return ctx;
2883
2884errout:
2885 return ERR_PTR(err);
2886}
2887
2888static void perf_event_free_filter(struct perf_event *event);
2889
2890static void free_event_rcu(struct rcu_head *head)
2891{
2892 struct perf_event *event;
2893
2894 event = container_of(head, struct perf_event, rcu_head);
2895 if (event->ns)
2896 put_pid_ns(event->ns);
2897 perf_event_free_filter(event);
2898 kfree(event);
2899}
2900
2901static void perf_buffer_put(struct perf_buffer *buffer);
2902
2903static void free_event(struct perf_event *event)
2904{
2905 irq_work_sync(&event->pending);
2906
2907 if (!event->parent) {
2908 if (event->attach_state & PERF_ATTACH_TASK)
2909 jump_label_dec(&perf_sched_events);
2910 if (event->attr.mmap || event->attr.mmap_data)
2911 atomic_dec(&nr_mmap_events);
2912 if (event->attr.comm)
2913 atomic_dec(&nr_comm_events);
2914 if (event->attr.task)
2915 atomic_dec(&nr_task_events);
2916 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2917 put_callchain_buffers();
2918 if (is_cgroup_event(event)) {
2919 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2920 jump_label_dec(&perf_sched_events);
2921 }
2922 }
2923
2924 if (event->buffer) {
2925 perf_buffer_put(event->buffer);
2926 event->buffer = NULL;
2927 }
2928
2929 if (is_cgroup_event(event))
2930 perf_detach_cgroup(event);
2931
2932 if (event->destroy)
2933 event->destroy(event);
2934
2935 if (event->ctx)
2936 put_ctx(event->ctx);
2937
2938 call_rcu(&event->rcu_head, free_event_rcu);
2939}
2940
2941int perf_event_release_kernel(struct perf_event *event)
2942{
2943 struct perf_event_context *ctx = event->ctx;
2944
2945 /*
2946 * Remove from the PMU, can't get re-enabled since we got
2947 * here because the last ref went.
2948 */
2949 perf_event_disable(event);
2950
2951 WARN_ON_ONCE(ctx->parent_ctx);
2952 /*
2953 * There are two ways this annotation is useful:
2954 *
2955	 * 1) there is a lock recursion from perf_event_exit_task;
2956 * see the comment there.
2957 *
2958 * 2) there is a lock-inversion with mmap_sem through
2959 * perf_event_read_group(), which takes faults while
2960 * holding ctx->mutex, however this is called after
2961 * the last filedesc died, so there is no possibility
2962 * to trigger the AB-BA case.
2963 */
2964 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2965 raw_spin_lock_irq(&ctx->lock);
2966 perf_group_detach(event);
2967 list_del_event(event, ctx);
2968 raw_spin_unlock_irq(&ctx->lock);
2969 mutex_unlock(&ctx->mutex);
2970
2971 free_event(event);
2972
2973 return 0;
2974}
2975EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2976
2977/*
2978 * Called when the last reference to the file is gone.
2979 */
2980static int perf_release(struct inode *inode, struct file *file)
2981{
2982 struct perf_event *event = file->private_data;
2983 struct task_struct *owner;
2984
2985 file->private_data = NULL;
2986
2987 rcu_read_lock();
2988 owner = ACCESS_ONCE(event->owner);
2989 /*
2990 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2991 * !owner it means the list deletion is complete and we can indeed
2992 * free this event, otherwise we need to serialize on
2993 * owner->perf_event_mutex.
2994 */
2995 smp_read_barrier_depends();
2996 if (owner) {
2997 /*
2998 * Since delayed_put_task_struct() also drops the last
2999 * task reference we can safely take a new reference
3000 * while holding the rcu_read_lock().
3001 */
3002 get_task_struct(owner);
3003 }
3004 rcu_read_unlock();
3005
3006 if (owner) {
3007 mutex_lock(&owner->perf_event_mutex);
3008 /*
3009		 * We have to re-check the event->owner field: if it is cleared
3010		 * we raced with perf_event_exit_task(); acquiring the mutex
3011		 * ensured they're done, so we can proceed with freeing the
3012 * event.
3013 */
3014 if (event->owner)
3015 list_del_init(&event->owner_entry);
3016 mutex_unlock(&owner->perf_event_mutex);
3017 put_task_struct(owner);
3018 }
3019
3020 return perf_event_release_kernel(event);
3021}
3022
3023u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3024{
3025 struct perf_event *child;
3026 u64 total = 0;
3027
3028 *enabled = 0;
3029 *running = 0;
3030
3031 mutex_lock(&event->child_mutex);
3032 total += perf_event_read(event);
3033 *enabled += event->total_time_enabled +
3034 atomic64_read(&event->child_total_time_enabled);
3035 *running += event->total_time_running +
3036 atomic64_read(&event->child_total_time_running);
3037
3038 list_for_each_entry(child, &event->child_list, child_list) {
3039 total += perf_event_read(child);
3040 *enabled += child->total_time_enabled;
3041 *running += child->total_time_running;
3042 }
3043 mutex_unlock(&event->child_mutex);
3044
3045 return total;
3046}
3047EXPORT_SYMBOL_GPL(perf_event_read_value);
3048
3049static int perf_event_read_group(struct perf_event *event,
3050 u64 read_format, char __user *buf)
3051{
3052 struct perf_event *leader = event->group_leader, *sub;
3053 int n = 0, size = 0, ret = -EFAULT;
3054 struct perf_event_context *ctx = leader->ctx;
3055 u64 values[5];
3056 u64 count, enabled, running;
3057
3058 mutex_lock(&ctx->mutex);
3059 count = perf_event_read_value(leader, &enabled, &running);
3060
3061 values[n++] = 1 + leader->nr_siblings;
3062 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3063 values[n++] = enabled;
3064 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3065 values[n++] = running;
3066 values[n++] = count;
3067 if (read_format & PERF_FORMAT_ID)
3068 values[n++] = primary_event_id(leader);
3069
3070 size = n * sizeof(u64);
3071
3072 if (copy_to_user(buf, values, size))
3073 goto unlock;
3074
3075 ret = size;
3076
3077 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3078 n = 0;
3079
3080 values[n++] = perf_event_read_value(sub, &enabled, &running);
3081 if (read_format & PERF_FORMAT_ID)
3082 values[n++] = primary_event_id(sub);
3083
3084 size = n * sizeof(u64);
3085
3086 if (copy_to_user(buf + ret, values, size)) {
3087 ret = -EFAULT;
3088 goto unlock;
3089 }
3090
3091 ret += size;
3092 }
3093unlock:
3094 mutex_unlock(&ctx->mutex);
3095
3096 return ret;
3097}
3098
3099static int perf_event_read_one(struct perf_event *event,
3100 u64 read_format, char __user *buf)
3101{
3102 u64 enabled, running;
3103 u64 values[4];
3104 int n = 0;
3105
3106 values[n++] = perf_event_read_value(event, &enabled, &running);
3107 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3108 values[n++] = enabled;
3109 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3110 values[n++] = running;
3111 if (read_format & PERF_FORMAT_ID)
3112 values[n++] = primary_event_id(event);
3113
3114 if (copy_to_user(buf, values, n * sizeof(u64)))
3115 return -EFAULT;
3116
3117 return n * sizeof(u64);
3118}
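
Seen from the other end of the read(), the values[] array built by perf_event_read_one() is exactly what userspace gets back. A hedged sketch assuming the event was opened with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID (struct and helper names are invented):

#include <stdint.h>
#include <unistd.h>

/* Mirrors the values[] order above for a non-group read. */
struct single_read {
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
	uint64_t id;
};

static int read_counter(int perf_fd, struct single_read *out)
{
	ssize_t n = read(perf_fd, out, sizeof(*out));

	return n == (ssize_t)sizeof(*out) ? 0 : -1;
}
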
3119
3120/*
3121 * Read the performance event - simple non blocking version for now
3122 */
3123static ssize_t
3124perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3125{
3126 u64 read_format = event->attr.read_format;
3127 int ret;
3128
3129 /*
3130	 * Return end-of-file for a read on an event that is in
3131 * error state (i.e. because it was pinned but it couldn't be
3132 * scheduled on to the CPU at some point).
3133 */
3134 if (event->state == PERF_EVENT_STATE_ERROR)
3135 return 0;
3136
3137 if (count < event->read_size)
3138 return -ENOSPC;
3139
3140 WARN_ON_ONCE(event->ctx->parent_ctx);
3141 if (read_format & PERF_FORMAT_GROUP)
3142 ret = perf_event_read_group(event, read_format, buf);
3143 else
3144 ret = perf_event_read_one(event, read_format, buf);
3145
3146 return ret;
3147}
3148
3149static ssize_t
3150perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3151{
3152 struct perf_event *event = file->private_data;
3153
3154 return perf_read_hw(event, buf, count);
3155}
3156
3157static unsigned int perf_poll(struct file *file, poll_table *wait)
3158{
3159 struct perf_event *event = file->private_data;
3160 struct perf_buffer *buffer;
3161	unsigned int events = POLLHUP;
3162
3163 rcu_read_lock();
3164 buffer = rcu_dereference(event->buffer);
3165 if (buffer)
3166 events = atomic_xchg(&buffer->poll, 0);
3167 rcu_read_unlock();
3168
3169 poll_wait(file, &event->waitq, wait);
3170
3171 return events;
3172}
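
Since perf_poll() only reports the pending state latched in the buffer, a consumer normally parks in poll() until the ring buffer crosses its wakeup watermark. A minimal hedged userspace sketch:

#include <poll.h>

/* Block until the event fd becomes readable, then go consume records. */
static int wait_for_samples(int perf_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

	return poll(&pfd, 1, timeout_ms);
}
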
3173
3174static void perf_event_reset(struct perf_event *event)
3175{
3176 (void)perf_event_read(event);
3177 local64_set(&event->count, 0);
3178 perf_event_update_userpage(event);
3179}
3180
3181/*
3182 * Holding the top-level event's child_mutex means that any
3183 * descendant process that has inherited this event will block
3184 * in sync_child_event if it goes to exit, thus satisfying the
3185 * task existence requirements of perf_event_enable/disable.
3186 */
3187static void perf_event_for_each_child(struct perf_event *event,
3188 void (*func)(struct perf_event *))
3189{
3190 struct perf_event *child;
3191
3192 WARN_ON_ONCE(event->ctx->parent_ctx);
3193 mutex_lock(&event->child_mutex);
3194 func(event);
3195 list_for_each_entry(child, &event->child_list, child_list)
3196 func(child);
3197 mutex_unlock(&event->child_mutex);
3198}
3199
3200static void perf_event_for_each(struct perf_event *event,
3201 void (*func)(struct perf_event *))
3202{
3203 struct perf_event_context *ctx = event->ctx;
3204 struct perf_event *sibling;
3205
3206 WARN_ON_ONCE(ctx->parent_ctx);
3207 mutex_lock(&ctx->mutex);
3208 event = event->group_leader;
3209
3210 perf_event_for_each_child(event, func);
3211 func(event);
3212 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3213		perf_event_for_each_child(sibling, func);
3214 mutex_unlock(&ctx->mutex);
3215}
3216
3217static int perf_event_period(struct perf_event *event, u64 __user *arg)
3218{
3219 struct perf_event_context *ctx = event->ctx;
3220 int ret = 0;
3221 u64 value;
3222
3223 if (!is_sampling_event(event))
3224 return -EINVAL;
3225
3226 if (copy_from_user(&value, arg, sizeof(value)))
3227 return -EFAULT;
3228
3229 if (!value)
3230 return -EINVAL;
3231
3232 raw_spin_lock_irq(&ctx->lock);
3233 if (event->attr.freq) {
3234 if (value > sysctl_perf_event_sample_rate) {
3235 ret = -EINVAL;
3236 goto unlock;
3237 }
3238
3239 event->attr.sample_freq = value;
3240 } else {
3241 event->attr.sample_period = value;
3242 event->hw.sample_period = value;
3243 }
3244unlock:
3245 raw_spin_unlock_irq(&ctx->lock);
3246
3247 return ret;
3248}
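
The matching userspace call hands the kernel a pointer to the new u64 value; it is interpreted as a period, or as a frequency if the event was created with attr.freq set. A hedged sketch:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Change the sampling period of an already-open event. */
static int set_period(int perf_fd, uint64_t new_period)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period);
}
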
3249
3250static const struct file_operations perf_fops;
3251
3252static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3253{
3254 struct file *file;
3255
3256 file = fget_light(fd, fput_needed);
3257 if (!file)
3258 return ERR_PTR(-EBADF);
3259
3260 if (file->f_op != &perf_fops) {
3261 fput_light(file, *fput_needed);
3262 *fput_needed = 0;
3263 return ERR_PTR(-EBADF);
3264 }
3265
3266 return file->private_data;
3267}
3268
3269static int perf_event_set_output(struct perf_event *event,
3270 struct perf_event *output_event);
3271static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3272
3273static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3274{
3275 struct perf_event *event = file->private_data;
3276 void (*func)(struct perf_event *);
3277 u32 flags = arg;
3278
3279 switch (cmd) {
3280 case PERF_EVENT_IOC_ENABLE:
3281 func = perf_event_enable;
3282 break;
3283 case PERF_EVENT_IOC_DISABLE:
3284 func = perf_event_disable;
3285 break;
3286 case PERF_EVENT_IOC_RESET:
3287 func = perf_event_reset;
3288 break;
3289
3290 case PERF_EVENT_IOC_REFRESH:
3291 return perf_event_refresh(event, arg);
3292
3293 case PERF_EVENT_IOC_PERIOD:
3294 return perf_event_period(event, (u64 __user *)arg);
3295
3296 case PERF_EVENT_IOC_SET_OUTPUT:
3297 {
3298 struct perf_event *output_event = NULL;
3299 int fput_needed = 0;
3300 int ret;
3301
3302 if (arg != -1) {
3303 output_event = perf_fget_light(arg, &fput_needed);
3304 if (IS_ERR(output_event))
3305 return PTR_ERR(output_event);
3306 }
3307
3308 ret = perf_event_set_output(event, output_event);
3309 if (output_event)
3310 fput_light(output_event->filp, fput_needed);
3311
3312 return ret;
3313 }
3314
3315 case PERF_EVENT_IOC_SET_FILTER:
3316 return perf_event_set_filter(event, (void __user *)arg);
3317
3318 default:
3319 return -ENOTTY;
3320 }
3321
3322 if (flags & PERF_IOC_FLAG_GROUP)
3323 perf_event_for_each(event, func);
3324 else
3325 perf_event_for_each_child(event, func);
3326
3327 return 0;
3328}
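
Two of the cases above as they look from userspace: enabling a leader together with all its siblings via PERF_IOC_FLAG_GROUP, and redirecting an event's records into another event's ring buffer. A hedged sketch (helper names invented):

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Enable the group leader and every sibling in one call. */
static int enable_group(int leader_fd)
{
	return ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}

/* Emit this event's records into the buffer mmap()ed on target_fd;
 * passing -1 instead detaches the redirection again.
 */
static int redirect_output(int fd, int target_fd)
{
	return ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}
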
3329
3330int perf_event_task_enable(void)
3331{
3332 struct perf_event *event;
3333
3334 mutex_lock(&current->perf_event_mutex);
3335 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3336 perf_event_for_each_child(event, perf_event_enable);
3337 mutex_unlock(&current->perf_event_mutex);
3338
3339 return 0;
3340}
3341
3342int perf_event_task_disable(void)
3343{
3344 struct perf_event *event;
3345
3346 mutex_lock(&current->perf_event_mutex);
3347 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3348 perf_event_for_each_child(event, perf_event_disable);
3349 mutex_unlock(&current->perf_event_mutex);
3350
3351 return 0;
3352}
3353
3354#ifndef PERF_EVENT_INDEX_OFFSET
3355# define PERF_EVENT_INDEX_OFFSET 0
3356#endif
3357
3358static int perf_event_index(struct perf_event *event)
3359{
3360 if (event->hw.state & PERF_HES_STOPPED)
3361 return 0;
3362
3363 if (event->state != PERF_EVENT_STATE_ACTIVE)
3364 return 0;
3365
3366 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3367}
3368
3369/*
3370 * Callers need to ensure there can be no nesting of this function, otherwise
3371 * the seqlock logic goes bad. We cannot serialize this because the arch
3372 * code calls this from NMI context.
3373 */
3374void perf_event_update_userpage(struct perf_event *event)
3375{
3376 struct perf_event_mmap_page *userpg;
3377 struct perf_buffer *buffer;
3378
3379 rcu_read_lock();
3380 buffer = rcu_dereference(event->buffer);
3381 if (!buffer)
3382 goto unlock;
3383
3384 userpg = buffer->user_page;
3385
3386 /*
3387 * Disable preemption so as to not let the corresponding user-space
3388 * spin too long if we get preempted.
3389 */
3390 preempt_disable();
3391 ++userpg->lock;
3392 barrier();
3393 userpg->index = perf_event_index(event);
3394 userpg->offset = perf_event_count(event);
3395 if (event->state == PERF_EVENT_STATE_ACTIVE)
3396 userpg->offset -= local64_read(&event->hw.prev_count);
3397
3398 userpg->time_enabled = event->total_time_enabled +
3399 atomic64_read(&event->child_total_time_enabled);
3400
3401 userpg->time_running = event->total_time_running +
3402 atomic64_read(&event->child_total_time_running);
3403
3404 barrier();
3405 ++userpg->lock;
3406 preempt_enable();
3407unlock:
3408 rcu_read_unlock();
3409}
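
The ++lock/barrier pairs above form a seqcount, so userspace has to read the mmap()ed control page in a retry loop: snapshot the fields between two reads of ->lock and start over if it changed. A hedged sketch (barrier() here is just a compiler barrier; the field names are those of struct perf_event_mmap_page):

#include <stdint.h>
#include <linux/perf_event.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

/* Take a consistent snapshot of the counter index/offset and times. */
static void read_userpage(volatile struct perf_event_mmap_page *pc,
			  uint32_t *idx, int64_t *offset,
			  uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		barrier();

		*idx     = pc->index;
		*offset  = pc->offset;
		*enabled = pc->time_enabled;
		*running = pc->time_running;

		barrier();
	} while (pc->lock != seq);
}
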
3410
3411static unsigned long perf_data_size(struct perf_buffer *buffer);
3412
3413static void
3414perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3415{
3416 long max_size = perf_data_size(buffer);
3417
3418 if (watermark)
3419 buffer->watermark = min(max_size, watermark);
3420
3421 if (!buffer->watermark)
3422 buffer->watermark = max_size / 2;
3423
3424 if (flags & PERF_BUFFER_WRITABLE)
3425 buffer->writable = 1;
3426
3427 atomic_set(&buffer->refcount, 1);
3428}
3429
3430#ifndef CONFIG_PERF_USE_VMALLOC
3431
3432/*
3433 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3434 */
3435
3436static struct page *
3437perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3438{
3439 if (pgoff > buffer->nr_pages)
3440 return NULL;
3441
3442 if (pgoff == 0)
3443 return virt_to_page(buffer->user_page);
3444
3445 return virt_to_page(buffer->data_pages[pgoff - 1]);
3446}
3447
3448static void *perf_mmap_alloc_page(int cpu)
3449{
3450 struct page *page;
3451 int node;
3452
3453 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3454 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3455 if (!page)
3456 return NULL;
3457
3458 return page_address(page);
3459}
3460
3461static struct perf_buffer *
3462perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3463{
3464 struct perf_buffer *buffer;
3465 unsigned long size;
3466 int i;
3467
3468 size = sizeof(struct perf_buffer);
3469 size += nr_pages * sizeof(void *);
3470
3471 buffer = kzalloc(size, GFP_KERNEL);
3472 if (!buffer)
3473 goto fail;
3474
3475 buffer->user_page = perf_mmap_alloc_page(cpu);
3476 if (!buffer->user_page)
3477 goto fail_user_page;
3478
3479 for (i = 0; i < nr_pages; i++) {
3480 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3481 if (!buffer->data_pages[i])
3482 goto fail_data_pages;
3483 }
3484
3485 buffer->nr_pages = nr_pages;
3486
3487 perf_buffer_init(buffer, watermark, flags);
3488
3489 return buffer;
3490
3491fail_data_pages:
3492 for (i--; i >= 0; i--)
3493 free_page((unsigned long)buffer->data_pages[i]);
3494
3495 free_page((unsigned long)buffer->user_page);
3496
3497fail_user_page:
3498 kfree(buffer);
3499
3500fail:
3501 return NULL;
3502}
3503
3504static void perf_mmap_free_page(unsigned long addr)
3505{
3506 struct page *page = virt_to_page((void *)addr);
3507
3508 page->mapping = NULL;
3509 __free_page(page);
3510}
3511
3512static void perf_buffer_free(struct perf_buffer *buffer)
3513{
3514 int i;
3515
3516 perf_mmap_free_page((unsigned long)buffer->user_page);
3517 for (i = 0; i < buffer->nr_pages; i++)
3518 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3519 kfree(buffer);
3520}
3521
3522static inline int page_order(struct perf_buffer *buffer)
3523{
3524 return 0;
3525}
3526
3527#else
3528
3529/*
3530 * Back perf_mmap() with vmalloc memory.
3531 *
3532 * Required for architectures that have d-cache aliasing issues.
3533 */
3534
3535static inline int page_order(struct perf_buffer *buffer)
3536{
3537 return buffer->page_order;
3538}
3539
3540static struct page *
3541perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3542{
3543 if (pgoff > (1UL << page_order(buffer)))
3544 return NULL;
3545
3546 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3547}
3548
3549static void perf_mmap_unmark_page(void *addr)
3550{
3551 struct page *page = vmalloc_to_page(addr);
3552
3553 page->mapping = NULL;
3554}
3555
3556static void perf_buffer_free_work(struct work_struct *work)
3557{
3558 struct perf_buffer *buffer;
3559 void *base;
3560 int i, nr;
3561
3562 buffer = container_of(work, struct perf_buffer, work);
3563 nr = 1 << page_order(buffer);
3564
3565 base = buffer->user_page;
3566 for (i = 0; i < nr + 1; i++)
3567 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3568
3569 vfree(base);
3570 kfree(buffer);
3571}
3572
3573static void perf_buffer_free(struct perf_buffer *buffer)
3574{
3575 schedule_work(&buffer->work);
3576}
3577
3578static struct perf_buffer *
3579perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3580{
3581 struct perf_buffer *buffer;
3582 unsigned long size;
3583 void *all_buf;
3584
3585 size = sizeof(struct perf_buffer);
3586 size += sizeof(void *);
3587
3588 buffer = kzalloc(size, GFP_KERNEL);
3589 if (!buffer)
3590 goto fail;
3591
3592 INIT_WORK(&buffer->work, perf_buffer_free_work);
3593
3594 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3595 if (!all_buf)
3596 goto fail_all_buf;
3597
3598 buffer->user_page = all_buf;
3599 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3600 buffer->page_order = ilog2(nr_pages);
3601 buffer->nr_pages = 1;
3602
3603 perf_buffer_init(buffer, watermark, flags);
3604
3605 return buffer;
3606
3607fail_all_buf:
3608 kfree(buffer);
3609
3610fail:
3611 return NULL;
3612}
3613
3614#endif
3615
3616static unsigned long perf_data_size(struct perf_buffer *buffer)
3617{
3618 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3619}
3620
3621static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3622{
3623 struct perf_event *event = vma->vm_file->private_data;
3624 struct perf_buffer *buffer;
3625 int ret = VM_FAULT_SIGBUS;
3626
3627 if (vmf->flags & FAULT_FLAG_MKWRITE) {
3628 if (vmf->pgoff == 0)
3629 ret = 0;
3630 return ret;
3631 }
3632
3633 rcu_read_lock();
3634 buffer = rcu_dereference(event->buffer);
3635 if (!buffer)
3636 goto unlock;
3637
3638 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3639 goto unlock;
3640
3641 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
3642 if (!vmf->page)
3643 goto unlock;
3644
3645 get_page(vmf->page);
3646 vmf->page->mapping = vma->vm_file->f_mapping;
3647 vmf->page->index = vmf->pgoff;
3648
3649 ret = 0;
3650unlock:
3651 rcu_read_unlock();
3652
3653 return ret;
3654}
3655
3656static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
3657{
3658 struct perf_buffer *buffer;
3659
3660 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
3661 perf_buffer_free(buffer);
3662}
3663
3664static struct perf_buffer *perf_buffer_get(struct perf_event *event)
3665{
3666 struct perf_buffer *buffer;
3667
3668 rcu_read_lock();
3669 buffer = rcu_dereference(event->buffer);
3670 if (buffer) {
3671 if (!atomic_inc_not_zero(&buffer->refcount))
3672 buffer = NULL;
3673 }
3674 rcu_read_unlock();
3675
3676 return buffer;
3677}
3678
3679static void perf_buffer_put(struct perf_buffer *buffer)
3680{
3681 if (!atomic_dec_and_test(&buffer->refcount))
3682 return;
3683
3684 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
3685}
3686
3687static void perf_mmap_open(struct vm_area_struct *vma)
3688{
3689 struct perf_event *event = vma->vm_file->private_data;
3690
3691 atomic_inc(&event->mmap_count);
3692}
3693
3694static void perf_mmap_close(struct vm_area_struct *vma)
3695{
3696 struct perf_event *event = vma->vm_file->private_data;
3697
3698 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3699 unsigned long size = perf_data_size(event->buffer);
3700 struct user_struct *user = event->mmap_user;
3701 struct perf_buffer *buffer = event->buffer;
3702
3703 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3704 vma->vm_mm->locked_vm -= event->mmap_locked;
3705 rcu_assign_pointer(event->buffer, NULL);
3706 mutex_unlock(&event->mmap_mutex);
3707
3708 perf_buffer_put(buffer);
3709 free_uid(user);
3710 }
3711}
3712
3713static const struct vm_operations_struct perf_mmap_vmops = {
3714 .open = perf_mmap_open,
3715 .close = perf_mmap_close,
3716 .fault = perf_mmap_fault,
3717 .page_mkwrite = perf_mmap_fault,
3718};
3719
3720static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3721{
3722 struct perf_event *event = file->private_data;
3723 unsigned long user_locked, user_lock_limit;
3724 struct user_struct *user = current_user();
3725 unsigned long locked, lock_limit;
3726 struct perf_buffer *buffer;
3727 unsigned long vma_size;
3728 unsigned long nr_pages;
3729 long user_extra, extra;
3730 int ret = 0, flags = 0;
3731
3732 /*
3733 * Don't allow mmap() of inherited per-task counters. This would
3734 * create a performance issue due to all children writing to the
3735 * same buffer.
3736 */
3737 if (event->cpu == -1 && event->attr.inherit)
3738 return -EINVAL;
3739
3740 if (!(vma->vm_flags & VM_SHARED))
3741 return -EINVAL;
3742
3743 vma_size = vma->vm_end - vma->vm_start;
3744 nr_pages = (vma_size / PAGE_SIZE) - 1;
3745
3746 /*
3747 * If we have buffer pages ensure they're a power-of-two number, so we
3748 * can do bitmasks instead of modulo.
3749 */
3750 if (nr_pages != 0 && !is_power_of_2(nr_pages))
3751 return -EINVAL;
3752
3753 if (vma_size != PAGE_SIZE * (1 + nr_pages))
3754 return -EINVAL;
3755
3756 if (vma->vm_pgoff != 0)
3757 return -EINVAL;
3758
3759 WARN_ON_ONCE(event->ctx->parent_ctx);
3760 mutex_lock(&event->mmap_mutex);
3761 if (event->buffer) {
3762 if (event->buffer->nr_pages == nr_pages)
3763 atomic_inc(&event->buffer->refcount);
3764 else
3765 ret = -EINVAL;
3766 goto unlock;
3767 }
3768
3769 user_extra = nr_pages + 1;
3770 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3771
3772 /*
3773 * Increase the limit linearly with more CPUs:
3774 */
3775 user_lock_limit *= num_online_cpus();
3776
3777 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3778
3779 extra = 0;
3780 if (user_locked > user_lock_limit)
3781 extra = user_locked - user_lock_limit;
3782
3783 lock_limit = rlimit(RLIMIT_MEMLOCK);
3784 lock_limit >>= PAGE_SHIFT;
3785 locked = vma->vm_mm->locked_vm + extra;
3786
3787 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3788 !capable(CAP_IPC_LOCK)) {
3789 ret = -EPERM;
3790 goto unlock;
3791 }
3792
3793 WARN_ON(event->buffer);
3794
3795 if (vma->vm_flags & VM_WRITE)
3796 flags |= PERF_BUFFER_WRITABLE;
3797
3798 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
3799 event->cpu, flags);
3800 if (!buffer) {
3801 ret = -ENOMEM;
3802 goto unlock;
3803 }
3804 rcu_assign_pointer(event->buffer, buffer);
3805
3806 atomic_long_add(user_extra, &user->locked_vm);
3807 event->mmap_locked = extra;
3808 event->mmap_user = get_current_user();
3809 vma->vm_mm->locked_vm += event->mmap_locked;
3810
3811unlock:
3812 if (!ret)
3813 atomic_inc(&event->mmap_count);
3814 mutex_unlock(&event->mmap_mutex);
3815
3816 vma->vm_flags |= VM_RESERVED;
3817 vma->vm_ops = &perf_mmap_vmops;
3818
3819 return ret;
3820}
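
The size and offset checks above boil down to a simple rule for userspace: map the metadata page plus a power-of-two number of data pages in a single MAP_SHARED mapping at offset 0. A hedged sketch:

#include <sys/mman.h>
#include <unistd.h>

/* Map the control page plus 2^data_page_order data pages. */
static void *map_perf_buffer(int perf_fd, unsigned int data_page_order)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t len  = (1 + (1UL << data_page_order)) * page;

	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);
}
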
3821
3822static int perf_fasync(int fd, struct file *filp, int on)
3823{
3824 struct inode *inode = filp->f_path.dentry->d_inode;
3825 struct perf_event *event = filp->private_data;
3826 int retval;
3827
3828 mutex_lock(&inode->i_mutex);
3829 retval = fasync_helper(fd, filp, on, &event->fasync);
3830 mutex_unlock(&inode->i_mutex);
3831
3832 if (retval < 0)
3833 return retval;
3834
3835 return 0;
3836}
3837
3838static const struct file_operations perf_fops = {
3839 .llseek = no_llseek,
3840 .release = perf_release,
3841 .read = perf_read,
3842 .poll = perf_poll,
3843 .unlocked_ioctl = perf_ioctl,
3844 .compat_ioctl = perf_ioctl,
3845 .mmap = perf_mmap,
3846 .fasync = perf_fasync,
3847};
3848
3849/*
3850 * Perf event wakeup
3851 *
3852 * If there's data, ensure we set the poll() state and publish everything
3853 * to user-space before waking everybody up.
3854 */
3855
3856void perf_event_wakeup(struct perf_event *event)
3857{
3858 wake_up_all(&event->waitq);
3859
3860 if (event->pending_kill) {
3861 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3862 event->pending_kill = 0;
3863 }
3864}
3865
3866static void perf_pending_event(struct irq_work *entry)
3867{
3868 struct perf_event *event = container_of(entry,
3869 struct perf_event, pending);
3870
3871 if (event->pending_disable) {
3872 event->pending_disable = 0;
3873 __perf_event_disable(event);
3874 }
3875
3876 if (event->pending_wakeup) {
3877 event->pending_wakeup = 0;
3878 perf_event_wakeup(event);
3879 }
3880}
3881
3882/*
3883 * We assume that only KVM supports these callbacks.
3884 * Later on, we might change it to a list if there is
3885 * another virtualization implementation supporting the callbacks.
3886 */
3887struct perf_guest_info_callbacks *perf_guest_cbs;
3888
3889int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3890{
3891 perf_guest_cbs = cbs;
3892 return 0;
3893}
3894EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3895
3896int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3897{
3898 perf_guest_cbs = NULL;
3899 return 0;
3900}
3901EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3902
3903/*
3904 * Output
3905 */
3906static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3907 unsigned long offset, unsigned long head)
3908{
3909 unsigned long mask;
3910
3911 if (!buffer->writable)
3912 return true;
3913
3914 mask = perf_data_size(buffer) - 1;
3915
3916 offset = (offset - tail) & mask;
3917 head = (head - tail) & mask;
3918
3919 if ((int)(head - offset) < 0)
3920 return false;
3921
3922 return true;
3923}
3924
3925static void perf_output_wakeup(struct perf_output_handle *handle)
3926{
3927 atomic_set(&handle->buffer->poll, POLL_IN);
3928
3929 if (handle->nmi) {
3930 handle->event->pending_wakeup = 1;
3931 irq_work_queue(&handle->event->pending);
3932 } else
3933 perf_event_wakeup(handle->event);
3934}
3935
3936/*
3937 * We need to ensure a later event_id doesn't publish a head when a former
3938 * event isn't done writing. However since we need to deal with NMIs we
3939 * cannot fully serialize things.
3940 *
3941 * We only publish the head (and generate a wakeup) when the outer-most
3942 * event completes.
3943 */
3944static void perf_output_get_handle(struct perf_output_handle *handle)
3945{
3946 struct perf_buffer *buffer = handle->buffer;
3947
3948 preempt_disable();
3949 local_inc(&buffer->nest);
3950 handle->wakeup = local_read(&buffer->wakeup);
3951}
3952
3953static void perf_output_put_handle(struct perf_output_handle *handle)
3954{
3955 struct perf_buffer *buffer = handle->buffer;
3956 unsigned long head;
3957
3958again:
3959 head = local_read(&buffer->head);
3960
3961 /*
3962 * IRQ/NMI can happen here, which means we can miss a head update.
3963 */
3964
3965 if (!local_dec_and_test(&buffer->nest))
3966 goto out;
3967
3968 /*
3969 * Publish the known good head. Rely on the full barrier implied
3970 * by local_dec_and_test() to order the buffer->head read and this
3971 * write.
3972 */
3973 buffer->user_page->data_head = head;
3974
3975 /*
3976 * Now check if we missed an update, rely on the (compiler)
3977 * barrier in local_dec_and_test() to re-read buffer->head.
3978 */
3979 if (unlikely(head != local_read(&buffer->head))) {
3980 local_inc(&buffer->nest);
3981 goto again;
3982 }
3983
3984 if (handle->wakeup != local_read(&buffer->wakeup))
3985 perf_output_wakeup(handle);
3986
3987out:
3988 preempt_enable();
3989}
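
The consumer side mirrors this handshake: read data_head once, order that read before touching the data, consume records up to the head, then publish the new data_tail so perf_output_space() sees the freed room. A hedged sketch (the barrier macros stand in for the architecture's rmb()/mb(); records that wrap the end of the buffer would need to be copied out first, which is omitted here):

#include <stdint.h>
#include <linux/perf_event.h>

#define rmb() __sync_synchronize()
#define mb()  __sync_synchronize()

/* Drain everything currently published in a writable ring buffer. */
static void drain(volatile struct perf_event_mmap_page *pc, char *data,
		  uint64_t data_size,
		  void (*handle)(struct perf_event_header *))
{
	uint64_t head = pc->data_head;
	uint64_t tail = pc->data_tail;

	rmb();			/* see the head before reading the data */

	while (tail < head) {
		struct perf_event_header *hdr =
			(void *)(data + (tail & (data_size - 1)));

		handle(hdr);
		tail += hdr->size;
	}

	mb();			/* finish reading before freeing the space */
	pc->data_tail = tail;
}
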
3990
3991__always_inline void perf_output_copy(struct perf_output_handle *handle,
3992 const void *buf, unsigned int len)
3993{
3994 do {
3995 unsigned long size = min_t(unsigned long, handle->size, len);
3996
3997 memcpy(handle->addr, buf, size);
3998
3999 len -= size;
4000 handle->addr += size;
4001 buf += size;
4002 handle->size -= size;
4003 if (!handle->size) {
4004 struct perf_buffer *buffer = handle->buffer;
4005
4006 handle->page++;
4007 handle->page &= buffer->nr_pages - 1;
4008 handle->addr = buffer->data_pages[handle->page];
4009 handle->size = PAGE_SIZE << page_order(buffer);
4010 }
4011 } while (len);
4012}
4013
4014static void __perf_event_header__init_id(struct perf_event_header *header,
4015 struct perf_sample_data *data,
4016 struct perf_event *event)
4017{
4018 u64 sample_type = event->attr.sample_type;
4019
4020 data->type = sample_type;
4021 header->size += event->id_header_size;
4022
4023 if (sample_type & PERF_SAMPLE_TID) {
4024 /* namespace issues */
4025 data->tid_entry.pid = perf_event_pid(event, current);
4026 data->tid_entry.tid = perf_event_tid(event, current);
4027 }
4028
4029 if (sample_type & PERF_SAMPLE_TIME)
4030 data->time = perf_clock();
4031
4032 if (sample_type & PERF_SAMPLE_ID)
4033 data->id = primary_event_id(event);
4034
4035 if (sample_type & PERF_SAMPLE_STREAM_ID)
4036 data->stream_id = event->id;
4037
4038 if (sample_type & PERF_SAMPLE_CPU) {
4039 data->cpu_entry.cpu = raw_smp_processor_id();
4040 data->cpu_entry.reserved = 0;
4041 }
4042}
4043
4044static void perf_event_header__init_id(struct perf_event_header *header,
4045 struct perf_sample_data *data,
4046 struct perf_event *event)
4047{
4048 if (event->attr.sample_id_all)
4049 __perf_event_header__init_id(header, data, event);
4050}
4051
4052static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4053 struct perf_sample_data *data)
4054{
4055 u64 sample_type = data->type;
4056
4057 if (sample_type & PERF_SAMPLE_TID)
4058 perf_output_put(handle, data->tid_entry);
4059
4060 if (sample_type & PERF_SAMPLE_TIME)
4061 perf_output_put(handle, data->time);
4062
4063 if (sample_type & PERF_SAMPLE_ID)
4064 perf_output_put(handle, data->id);
4065
4066 if (sample_type & PERF_SAMPLE_STREAM_ID)
4067 perf_output_put(handle, data->stream_id);
4068
4069 if (sample_type & PERF_SAMPLE_CPU)
4070 perf_output_put(handle, data->cpu_entry);
4071}
4072
4073static void perf_event__output_id_sample(struct perf_event *event,
4074 struct perf_output_handle *handle,
4075 struct perf_sample_data *sample)
4076{
4077 if (event->attr.sample_id_all)
4078 __perf_event__output_id_sample(handle, sample);
4079}
4080
4081int perf_output_begin(struct perf_output_handle *handle,
4082 struct perf_event *event, unsigned int size,
4083 int nmi, int sample)
4084{
4085 struct perf_buffer *buffer;
4086 unsigned long tail, offset, head;
4087 int have_lost;
4088 struct perf_sample_data sample_data;
4089 struct {
4090 struct perf_event_header header;
4091 u64 id;
4092 u64 lost;
4093 } lost_event;
4094
4095 rcu_read_lock();
4096 /*
4097 * For inherited events we send all the output towards the parent.
4098 */
4099 if (event->parent)
4100 event = event->parent;
4101
4102 buffer = rcu_dereference(event->buffer);
4103 if (!buffer)
4104 goto out;
4105
4106 handle->buffer = buffer;
4107 handle->event = event;
4108 handle->nmi = nmi;
4109 handle->sample = sample;
4110
4111 if (!buffer->nr_pages)
4112 goto out;
4113
4114 have_lost = local_read(&buffer->lost);
4115 if (have_lost) {
4116 lost_event.header.size = sizeof(lost_event);
4117 perf_event_header__init_id(&lost_event.header, &sample_data,
4118 event);
4119 size += lost_event.header.size;
4120 }
4121
4122 perf_output_get_handle(handle);
4123
4124 do {
4125 /*
4126		 * Userspace could choose to issue an mb() before updating the
4127		 * tail pointer, so that all reads will be completed before the
4128 * write is issued.
4129 */
4130 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4131 smp_rmb();
4132 offset = head = local_read(&buffer->head);
4133 head += size;
4134 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4135 goto fail;
4136 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4137
4138 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4139 local_add(buffer->watermark, &buffer->wakeup);
4140
4141 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4142 handle->page &= buffer->nr_pages - 1;
4143 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4144 handle->addr = buffer->data_pages[handle->page];
4145 handle->addr += handle->size;
4146 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4147
4148 if (have_lost) {
4149 lost_event.header.type = PERF_RECORD_LOST;
4150 lost_event.header.misc = 0;
4151 lost_event.id = event->id;
4152 lost_event.lost = local_xchg(&buffer->lost, 0);
4153
4154 perf_output_put(handle, lost_event);
4155 perf_event__output_id_sample(event, handle, &sample_data);
4156 }
4157
4158 return 0;
4159
4160fail:
4161 local_inc(&buffer->lost);
4162 perf_output_put_handle(handle);
4163out:
4164 rcu_read_unlock();
4165
4166 return -ENOSPC;
4167}
4168
4169void perf_output_end(struct perf_output_handle *handle)
4170{
4171 struct perf_event *event = handle->event;
4172 struct perf_buffer *buffer = handle->buffer;
4173
4174 int wakeup_events = event->attr.wakeup_events;
4175
4176 if (handle->sample && wakeup_events) {
4177 int events = local_inc_return(&buffer->events);
4178 if (events >= wakeup_events) {
4179 local_sub(wakeup_events, &buffer->events);
4180 local_inc(&buffer->wakeup);
4181 }
4182 }
4183
4184 perf_output_put_handle(handle);
4185 rcu_read_unlock();
4186}
4187
4188static void perf_output_read_one(struct perf_output_handle *handle,
4189 struct perf_event *event,
4190 u64 enabled, u64 running)
4191{
4192 u64 read_format = event->attr.read_format;
4193 u64 values[4];
4194 int n = 0;
4195
4196 values[n++] = perf_event_count(event);
4197 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4198 values[n++] = enabled +
4199 atomic64_read(&event->child_total_time_enabled);
4200 }
4201 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4202 values[n++] = running +
4203 atomic64_read(&event->child_total_time_running);
4204 }
4205 if (read_format & PERF_FORMAT_ID)
4206 values[n++] = primary_event_id(event);
4207
4208 perf_output_copy(handle, values, n * sizeof(u64));
4209}
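
/*
 * For example, with read_format == (PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_ID) the function above emits three u64s:
 *
 *	{ value, time_enabled, id }
 */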
4210
4211/*
4212 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4213 */
4214static void perf_output_read_group(struct perf_output_handle *handle,
4215 struct perf_event *event,
4216 u64 enabled, u64 running)
4217{
4218 struct perf_event *leader = event->group_leader, *sub;
4219 u64 read_format = event->attr.read_format;
4220 u64 values[5];
4221 int n = 0;
4222
4223 values[n++] = 1 + leader->nr_siblings;
4224
4225 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4226 values[n++] = enabled;
4227
4228 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4229 values[n++] = running;
4230
4231 if (leader != event)
4232 leader->pmu->read(leader);
4233
4234 values[n++] = perf_event_count(leader);
4235 if (read_format & PERF_FORMAT_ID)
4236 values[n++] = primary_event_id(leader);
4237
4238 perf_output_copy(handle, values, n * sizeof(u64));
4239
4240 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4241 n = 0;
4242
4243 if (sub != event)
4244 sub->pmu->read(sub);
4245
4246 values[n++] = perf_event_count(sub);
4247 if (read_format & PERF_FORMAT_ID)
4248 values[n++] = primary_event_id(sub);
4249
4250 perf_output_copy(handle, values, n * sizeof(u64));
4251 }
4252}
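
/*
 * With PERF_FORMAT_GROUP the read data emitted above looks like:
 *
 *	{ u64 nr;
 *	  { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 value; { u64 id; } && PERF_FORMAT_ID } cntr[nr];
 *	}
 */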
4253
4254#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4255 PERF_FORMAT_TOTAL_TIME_RUNNING)
4256
4257static void perf_output_read(struct perf_output_handle *handle,
4258 struct perf_event *event)
4259{
4260 u64 enabled = 0, running = 0, now, ctx_time;
4261 u64 read_format = event->attr.read_format;
4262
4263 /*
4264 * compute total_time_enabled, total_time_running
4265 * based on snapshot values taken when the event
4266 * was last scheduled in.
4267 *
4268	 * we cannot simply call update_context_time()
4269	 * because of locking issues, as we are called in
4270 * NMI context
4271 */
4272 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
4273 now = perf_clock();
4274 ctx_time = event->shadow_ctx_time + now;
4275 enabled = ctx_time - event->tstamp_enabled;
4276 running = ctx_time - event->tstamp_running;
4277 }
4278
4279 if (event->attr.read_format & PERF_FORMAT_GROUP)
4280 perf_output_read_group(handle, event, enabled, running);
4281 else
4282 perf_output_read_one(handle, event, enabled, running);
4283}
4284
4285void perf_output_sample(struct perf_output_handle *handle,
4286 struct perf_event_header *header,
4287 struct perf_sample_data *data,
4288 struct perf_event *event)
4289{
4290 u64 sample_type = data->type;
4291
4292 perf_output_put(handle, *header);
4293
4294 if (sample_type & PERF_SAMPLE_IP)
4295 perf_output_put(handle, data->ip);
4296
4297 if (sample_type & PERF_SAMPLE_TID)
4298 perf_output_put(handle, data->tid_entry);
4299
4300 if (sample_type & PERF_SAMPLE_TIME)
4301 perf_output_put(handle, data->time);
4302
4303 if (sample_type & PERF_SAMPLE_ADDR)
4304 perf_output_put(handle, data->addr);
4305
4306 if (sample_type & PERF_SAMPLE_ID)
4307 perf_output_put(handle, data->id);
4308
4309 if (sample_type & PERF_SAMPLE_STREAM_ID)
4310 perf_output_put(handle, data->stream_id);
4311
4312 if (sample_type & PERF_SAMPLE_CPU)
4313 perf_output_put(handle, data->cpu_entry);
4314
4315 if (sample_type & PERF_SAMPLE_PERIOD)
4316 perf_output_put(handle, data->period);
4317
4318 if (sample_type & PERF_SAMPLE_READ)
4319 perf_output_read(handle, event);
4320
4321 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4322 if (data->callchain) {
4323 int size = 1;
4324
4325 if (data->callchain)
4326 size += data->callchain->nr;
4327
4328 size *= sizeof(u64);
4329
4330 perf_output_copy(handle, data->callchain, size);
4331 } else {
4332 u64 nr = 0;
4333 perf_output_put(handle, nr);
4334 }
4335 }
4336
4337 if (sample_type & PERF_SAMPLE_RAW) {
4338 if (data->raw) {
4339 perf_output_put(handle, data->raw->size);
4340 perf_output_copy(handle, data->raw->data,
4341 data->raw->size);
4342 } else {
4343 struct {
4344 u32 size;
4345 u32 data;
4346 } raw = {
4347 .size = sizeof(u32),
4348 .data = 0,
4349 };
4350 perf_output_put(handle, raw);
4351 }
4352 }
4353}
4354
4355void perf_prepare_sample(struct perf_event_header *header,
4356 struct perf_sample_data *data,
4357 struct perf_event *event,
4358 struct pt_regs *regs)
4359{
4360 u64 sample_type = event->attr.sample_type;
4361
4362 header->type = PERF_RECORD_SAMPLE;
4363 header->size = sizeof(*header) + event->header_size;
4364
4365 header->misc = 0;
4366 header->misc |= perf_misc_flags(regs);
4367
4368 __perf_event_header__init_id(header, data, event);
4369
4370 if (sample_type & PERF_SAMPLE_IP)
4371 data->ip = perf_instruction_pointer(regs);
4372
4373 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4374 int size = 1;
4375
4376 data->callchain = perf_callchain(regs);
4377
4378 if (data->callchain)
4379 size += data->callchain->nr;
4380
4381 header->size += size * sizeof(u64);
4382 }
4383
4384 if (sample_type & PERF_SAMPLE_RAW) {
4385 int size = sizeof(u32);
4386
4387 if (data->raw)
4388 size += data->raw->size;
4389 else
4390 size += sizeof(u32);
4391
4392 WARN_ON_ONCE(size & (sizeof(u64)-1));
4393 header->size += size;
4394 }
4395}
4396
4397static void perf_event_output(struct perf_event *event, int nmi,
4398 struct perf_sample_data *data,
4399 struct pt_regs *regs)
4400{
4401 struct perf_output_handle handle;
4402 struct perf_event_header header;
4403
4404 /* protect the callchain buffers */
4405 rcu_read_lock();
4406
4407 perf_prepare_sample(&header, data, event, regs);
4408
4409 if (perf_output_begin(&handle, event, header.size, nmi, 1))
4410 goto exit;
4411
4412 perf_output_sample(&handle, &header, data, event);
4413
4414 perf_output_end(&handle);
4415
4416exit:
4417 rcu_read_unlock();
4418}
4419
4420/*
4421 * read event_id
4422 */
4423
4424struct perf_read_event {
4425 struct perf_event_header header;
4426
4427 u32 pid;
4428 u32 tid;
4429};
4430
4431static void
4432perf_event_read_event(struct perf_event *event,
4433 struct task_struct *task)
4434{
4435 struct perf_output_handle handle;
4436 struct perf_sample_data sample;
4437 struct perf_read_event read_event = {
4438 .header = {
4439 .type = PERF_RECORD_READ,
4440 .misc = 0,
4441 .size = sizeof(read_event) + event->read_size,
4442 },
4443 .pid = perf_event_pid(event, task),
4444 .tid = perf_event_tid(event, task),
4445 };
4446 int ret;
4447
4448 perf_event_header__init_id(&read_event.header, &sample, event);
4449 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
4450 if (ret)
4451 return;
4452
4453 perf_output_put(&handle, read_event);
4454 perf_output_read(&handle, event);
4455 perf_event__output_id_sample(event, &handle, &sample);
4456
4457 perf_output_end(&handle);
4458}
4459
4460/*
4461 * task tracking -- fork/exit
4462 *
4463 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
4464 */
4465
4466struct perf_task_event {
4467 struct task_struct *task;
4468 struct perf_event_context *task_ctx;
4469
4470 struct {
4471 struct perf_event_header header;
4472
4473 u32 pid;
4474 u32 ppid;
4475 u32 tid;
4476 u32 ptid;
4477 u64 time;
4478 } event_id;
4479};
4480
4481static void perf_event_task_output(struct perf_event *event,
4482 struct perf_task_event *task_event)
4483{
4484 struct perf_output_handle handle;
4485 struct perf_sample_data sample;
4486 struct task_struct *task = task_event->task;
4487 int ret, size = task_event->event_id.header.size;
4488
4489 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4490
4491 ret = perf_output_begin(&handle, event,
4492 task_event->event_id.header.size, 0, 0);
4493 if (ret)
4494 goto out;
4495
4496 task_event->event_id.pid = perf_event_pid(event, task);
4497 task_event->event_id.ppid = perf_event_pid(event, current);
4498
4499 task_event->event_id.tid = perf_event_tid(event, task);
4500 task_event->event_id.ptid = perf_event_tid(event, current);
4501
4502 perf_output_put(&handle, task_event->event_id);
4503
4504 perf_event__output_id_sample(event, &handle, &sample);
4505
4506 perf_output_end(&handle);
4507out:
4508 task_event->event_id.header.size = size;
4509}
4510
4511static int perf_event_task_match(struct perf_event *event)
4512{
4513 if (event->state < PERF_EVENT_STATE_INACTIVE)
4514 return 0;
4515
4516 if (!event_filter_match(event))
4517 return 0;
4518
4519 if (event->attr.comm || event->attr.mmap ||
4520 event->attr.mmap_data || event->attr.task)
4521 return 1;
4522
4523 return 0;
4524}
4525
4526static void perf_event_task_ctx(struct perf_event_context *ctx,
4527 struct perf_task_event *task_event)
4528{
4529 struct perf_event *event;
4530
4531 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4532 if (perf_event_task_match(event))
4533 perf_event_task_output(event, task_event);
4534 }
4535}
4536
4537static void perf_event_task_event(struct perf_task_event *task_event)
4538{
4539 struct perf_cpu_context *cpuctx;
4540 struct perf_event_context *ctx;
4541 struct pmu *pmu;
4542 int ctxn;
4543
4544 rcu_read_lock();
4545 list_for_each_entry_rcu(pmu, &pmus, entry) {
4546 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4547 if (cpuctx->active_pmu != pmu)
4548 goto next;
4549 perf_event_task_ctx(&cpuctx->ctx, task_event);
4550
4551 ctx = task_event->task_ctx;
4552 if (!ctx) {
4553 ctxn = pmu->task_ctx_nr;
4554 if (ctxn < 0)
4555 goto next;
4556 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4557 }
4558 if (ctx)
4559 perf_event_task_ctx(ctx, task_event);
4560next:
4561 put_cpu_ptr(pmu->pmu_cpu_context);
4562 }
4563 rcu_read_unlock();
4564}
4565
4566static void perf_event_task(struct task_struct *task,
4567 struct perf_event_context *task_ctx,
4568 int new)
4569{
4570 struct perf_task_event task_event;
4571
4572 if (!atomic_read(&nr_comm_events) &&
4573 !atomic_read(&nr_mmap_events) &&
4574 !atomic_read(&nr_task_events))
4575 return;
4576
4577 task_event = (struct perf_task_event){
4578 .task = task,
4579 .task_ctx = task_ctx,
4580 .event_id = {
4581 .header = {
4582 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4583 .misc = 0,
4584 .size = sizeof(task_event.event_id),
4585 },
4586 /* .pid */
4587 /* .ppid */
4588 /* .tid */
4589 /* .ptid */
4590 .time = perf_clock(),
4591 },
4592 };
4593
4594 perf_event_task_event(&task_event);
4595}
4596
4597void perf_event_fork(struct task_struct *task)
4598{
4599 perf_event_task(task, NULL, 1);
4600}
4601
4602/*
4603 * comm tracking
4604 */
4605
4606struct perf_comm_event {
4607 struct task_struct *task;
4608 char *comm;
4609 int comm_size;
4610
4611 struct {
4612 struct perf_event_header header;
4613
4614 u32 pid;
4615 u32 tid;
4616 } event_id;
4617};
4618
4619static void perf_event_comm_output(struct perf_event *event,
4620 struct perf_comm_event *comm_event)
4621{
4622 struct perf_output_handle handle;
4623 struct perf_sample_data sample;
4624 int size = comm_event->event_id.header.size;
4625 int ret;
4626
4627 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4628 ret = perf_output_begin(&handle, event,
4629 comm_event->event_id.header.size, 0, 0);
4630
4631 if (ret)
4632 goto out;
4633
4634 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4635 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4636
4637 perf_output_put(&handle, comm_event->event_id);
4638 perf_output_copy(&handle, comm_event->comm,
4639 comm_event->comm_size);
4640
4641 perf_event__output_id_sample(event, &handle, &sample);
4642
4643 perf_output_end(&handle);
4644out:
4645 comm_event->event_id.header.size = size;
4646}
4647
4648static int perf_event_comm_match(struct perf_event *event)
4649{
4650 if (event->state < PERF_EVENT_STATE_INACTIVE)
4651 return 0;
4652
4653 if (!event_filter_match(event))
4654 return 0;
4655
4656 if (event->attr.comm)
4657 return 1;
4658
4659 return 0;
4660}
4661
4662static void perf_event_comm_ctx(struct perf_event_context *ctx,
4663 struct perf_comm_event *comm_event)
4664{
4665 struct perf_event *event;
4666
4667 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4668 if (perf_event_comm_match(event))
4669 perf_event_comm_output(event, comm_event);
4670 }
4671}
4672
4673static void perf_event_comm_event(struct perf_comm_event *comm_event)
4674{
4675 struct perf_cpu_context *cpuctx;
4676 struct perf_event_context *ctx;
4677 char comm[TASK_COMM_LEN];
4678 unsigned int size;
4679 struct pmu *pmu;
4680 int ctxn;
4681
4682 memset(comm, 0, sizeof(comm));
4683 strlcpy(comm, comm_event->task->comm, sizeof(comm));
4684 size = ALIGN(strlen(comm)+1, sizeof(u64));
4685
4686 comm_event->comm = comm;
4687 comm_event->comm_size = size;
4688
4689 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4690 rcu_read_lock();
4691 list_for_each_entry_rcu(pmu, &pmus, entry) {
4692 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4693 if (cpuctx->active_pmu != pmu)
4694 goto next;
4695 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4696
4697 ctxn = pmu->task_ctx_nr;
4698 if (ctxn < 0)
4699 goto next;
4700
4701 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4702 if (ctx)
4703 perf_event_comm_ctx(ctx, comm_event);
4704next:
4705 put_cpu_ptr(pmu->pmu_cpu_context);
4706 }
4707 rcu_read_unlock();
4708}
4709
4710void perf_event_comm(struct task_struct *task)
4711{
4712 struct perf_comm_event comm_event;
4713 struct perf_event_context *ctx;
4714 int ctxn;
4715
4716 for_each_task_context_nr(ctxn) {
4717 ctx = task->perf_event_ctxp[ctxn];
4718 if (!ctx)
4719 continue;
4720
4721 perf_event_enable_on_exec(ctx);
4722 }
4723
4724 if (!atomic_read(&nr_comm_events))
4725 return;
4726
4727 comm_event = (struct perf_comm_event){
4728 .task = task,
4729 /* .comm */
4730 /* .comm_size */
4731 .event_id = {
4732 .header = {
4733 .type = PERF_RECORD_COMM,
4734 .misc = 0,
4735 /* .size */
4736 },
4737 /* .pid */
4738 /* .tid */
4739 },
4740 };
4741
4742 perf_event_comm_event(&comm_event);
4743}
4744
4745/*
4746 * mmap tracking
4747 */
4748
4749struct perf_mmap_event {
4750 struct vm_area_struct *vma;
4751
4752 const char *file_name;
4753 int file_size;
4754
4755 struct {
4756 struct perf_event_header header;
4757
4758 u32 pid;
4759 u32 tid;
4760 u64 start;
4761 u64 len;
4762 u64 pgoff;
4763 } event_id;
4764};
4765
4766static void perf_event_mmap_output(struct perf_event *event,
4767 struct perf_mmap_event *mmap_event)
4768{
4769 struct perf_output_handle handle;
4770 struct perf_sample_data sample;
4771 int size = mmap_event->event_id.header.size;
4772 int ret;
4773
4774 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4775 ret = perf_output_begin(&handle, event,
4776 mmap_event->event_id.header.size, 0, 0);
4777 if (ret)
4778 goto out;
4779
4780 mmap_event->event_id.pid = perf_event_pid(event, current);
4781 mmap_event->event_id.tid = perf_event_tid(event, current);
4782
4783 perf_output_put(&handle, mmap_event->event_id);
4784 perf_output_copy(&handle, mmap_event->file_name,
4785 mmap_event->file_size);
4786
4787 perf_event__output_id_sample(event, &handle, &sample);
4788
4789 perf_output_end(&handle);
4790out:
4791 mmap_event->event_id.header.size = size;
4792}
4793
4794static int perf_event_mmap_match(struct perf_event *event,
4795 struct perf_mmap_event *mmap_event,
4796 int executable)
4797{
4798 if (event->state < PERF_EVENT_STATE_INACTIVE)
4799 return 0;
4800
4801 if (!event_filter_match(event))
4802 return 0;
4803
4804 if ((!executable && event->attr.mmap_data) ||
4805 (executable && event->attr.mmap))
4806 return 1;
4807
4808 return 0;
4809}
4810
4811static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4812 struct perf_mmap_event *mmap_event,
4813 int executable)
4814{
4815 struct perf_event *event;
4816
4817 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4818 if (perf_event_mmap_match(event, mmap_event, executable))
4819 perf_event_mmap_output(event, mmap_event);
4820 }
4821}
4822
4823static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4824{
4825 struct perf_cpu_context *cpuctx;
4826 struct perf_event_context *ctx;
4827 struct vm_area_struct *vma = mmap_event->vma;
4828 struct file *file = vma->vm_file;
4829 unsigned int size;
4830 char tmp[16];
4831 char *buf = NULL;
4832 const char *name;
4833 struct pmu *pmu;
4834 int ctxn;
4835
4836 memset(tmp, 0, sizeof(tmp));
4837
4838 if (file) {
4839 /*
4840 * d_path works from the end of the buffer backwards, so we
4841 * need to add enough zero bytes after the string to handle
4842 * the 64bit alignment we do later.
4843 */
4844 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4845 if (!buf) {
4846 name = strncpy(tmp, "//enomem", sizeof(tmp));
4847 goto got_name;
4848 }
4849 name = d_path(&file->f_path, buf, PATH_MAX);
4850 if (IS_ERR(name)) {
4851 name = strncpy(tmp, "//toolong", sizeof(tmp));
4852 goto got_name;
4853 }
4854 } else {
4855 if (arch_vma_name(mmap_event->vma)) {
4856 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4857 sizeof(tmp));
4858 goto got_name;
4859 }
4860
4861 if (!vma->vm_mm) {
4862 name = strncpy(tmp, "[vdso]", sizeof(tmp));
4863 goto got_name;
4864 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4865 vma->vm_end >= vma->vm_mm->brk) {
4866 name = strncpy(tmp, "[heap]", sizeof(tmp));
4867 goto got_name;
4868 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4869 vma->vm_end >= vma->vm_mm->start_stack) {
4870 name = strncpy(tmp, "[stack]", sizeof(tmp));
4871 goto got_name;
4872 }
4873
4874 name = strncpy(tmp, "//anon", sizeof(tmp));
4875 goto got_name;
4876 }
4877
4878got_name:
4879 size = ALIGN(strlen(name)+1, sizeof(u64));
4880
4881 mmap_event->file_name = name;
4882 mmap_event->file_size = size;
4883
4884 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4885
4886 rcu_read_lock();
4887 list_for_each_entry_rcu(pmu, &pmus, entry) {
4888 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4889 if (cpuctx->active_pmu != pmu)
4890 goto next;
4891 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4892 vma->vm_flags & VM_EXEC);
4893
4894 ctxn = pmu->task_ctx_nr;
4895 if (ctxn < 0)
4896 goto next;
4897
4898 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4899 if (ctx) {
4900 perf_event_mmap_ctx(ctx, mmap_event,
4901 vma->vm_flags & VM_EXEC);
4902 }
4903next:
4904 put_cpu_ptr(pmu->pmu_cpu_context);
4905 }
4906 rcu_read_unlock();
4907
4908 kfree(buf);
4909}
4910
4911void perf_event_mmap(struct vm_area_struct *vma)
4912{
4913 struct perf_mmap_event mmap_event;
4914
4915 if (!atomic_read(&nr_mmap_events))
4916 return;
4917
4918 mmap_event = (struct perf_mmap_event){
4919 .vma = vma,
4920 /* .file_name */
4921 /* .file_size */
4922 .event_id = {
4923 .header = {
4924 .type = PERF_RECORD_MMAP,
4925 .misc = PERF_RECORD_MISC_USER,
4926 /* .size */
4927 },
4928 /* .pid */
4929 /* .tid */
4930 .start = vma->vm_start,
4931 .len = vma->vm_end - vma->vm_start,
4932 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
4933 },
4934 };
4935
4936 perf_event_mmap_event(&mmap_event);
4937}
4938
4939/*
4940 * IRQ throttle logging
4941 */
4942
4943static void perf_log_throttle(struct perf_event *event, int enable)
4944{
4945 struct perf_output_handle handle;
4946 struct perf_sample_data sample;
4947 int ret;
4948
4949 struct {
4950 struct perf_event_header header;
4951 u64 time;
4952 u64 id;
4953 u64 stream_id;
4954 } throttle_event = {
4955 .header = {
4956 .type = PERF_RECORD_THROTTLE,
4957 .misc = 0,
4958 .size = sizeof(throttle_event),
4959 },
4960 .time = perf_clock(),
4961 .id = primary_event_id(event),
4962 .stream_id = event->id,
4963 };
4964
4965 if (enable)
4966 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4967
4968 perf_event_header__init_id(&throttle_event.header, &sample, event);
4969
4970 ret = perf_output_begin(&handle, event,
4971 throttle_event.header.size, 1, 0);
4972 if (ret)
4973 return;
4974
4975 perf_output_put(&handle, throttle_event);
4976 perf_event__output_id_sample(event, &handle, &sample);
4977 perf_output_end(&handle);
4978}
4979
4980/*
4981 * Generic event overflow handling, sampling.
4982 */
4983
4984static int __perf_event_overflow(struct perf_event *event, int nmi,
4985 int throttle, struct perf_sample_data *data,
4986 struct pt_regs *regs)
4987{
4988 int events = atomic_read(&event->event_limit);
4989 struct hw_perf_event *hwc = &event->hw;
4990 int ret = 0;
4991
4992 /*
4993 * Non-sampling counters might still use the PMI to fold short
4994	 * hardware counters; ignore those.
4995 */
4996 if (unlikely(!is_sampling_event(event)))
4997 return 0;
4998
4999 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
5000 if (throttle) {
5001 hwc->interrupts = MAX_INTERRUPTS;
5002 perf_log_throttle(event, 0);
5003 ret = 1;
5004 }
5005 } else
5006 hwc->interrupts++;
5007
5008 if (event->attr.freq) {
5009 u64 now = perf_clock();
5010 s64 delta = now - hwc->freq_time_stamp;
5011
5012 hwc->freq_time_stamp = now;
5013
5014 if (delta > 0 && delta < 2*TICK_NSEC)
5015 perf_adjust_period(event, delta, hwc->last_period);
5016 }
5017
5018 /*
5019 * XXX event_limit might not quite work as expected on inherited
5020 * events
5021 */
5022
5023 event->pending_kill = POLL_IN;
5024 if (events && atomic_dec_and_test(&event->event_limit)) {
5025 ret = 1;
5026 event->pending_kill = POLL_HUP;
5027 if (nmi) {
5028 event->pending_disable = 1;
5029 irq_work_queue(&event->pending);
5030 } else
5031 perf_event_disable(event);
5032 }
5033
5034 if (event->overflow_handler)
5035 event->overflow_handler(event, nmi, data, regs);
5036 else
5037 perf_event_output(event, nmi, data, regs);
5038
5039 return ret;
5040}
5041
5042int perf_event_overflow(struct perf_event *event, int nmi,
5043 struct perf_sample_data *data,
5044 struct pt_regs *regs)
5045{
5046 return __perf_event_overflow(event, nmi, 1, data, regs);
5047}
5048
5049/*
5050 * Generic software event infrastructure
5051 */
5052
5053struct swevent_htable {
5054 struct swevent_hlist *swevent_hlist;
5055 struct mutex hlist_mutex;
5056 int hlist_refcount;
5057
5058 /* Recursion avoidance in each contexts */
5059 int recursion[PERF_NR_CONTEXTS];
5060};
5061
5062static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5063
5064/*
5065 * We directly increment event->count and keep a second value in
5066 * event->hw.period_left to count intervals. This period counter
5067 * is kept in the range [-sample_period, 0] so that we can use the
5068 * sign as the trigger.
5069 */
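/*
 * For example, with hwc->sample_period == 100 and period_left having
 * climbed to +30 (30 events past the trigger point),
 * perf_swevent_set_period() below reports nr = (100 + 30) / 100 = 1
 * overflow and rewinds period_left to -70.
 */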
5070
5071static u64 perf_swevent_set_period(struct perf_event *event)
5072{
5073 struct hw_perf_event *hwc = &event->hw;
5074 u64 period = hwc->last_period;
5075 u64 nr, offset;
5076 s64 old, val;
5077
5078 hwc->last_period = hwc->sample_period;
5079
5080again:
5081 old = val = local64_read(&hwc->period_left);
5082 if (val < 0)
5083 return 0;
5084
5085 nr = div64_u64(period + val, period);
5086 offset = nr * period;
5087 val -= offset;
5088 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
5089 goto again;
5090
5091 return nr;
5092}
5093
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data,
5096 struct pt_regs *regs)
5097{
5098 struct hw_perf_event *hwc = &event->hw;
5099 int throttle = 0;
5100
5101 data->period = event->hw.last_period;
5102 if (!overflow)
5103 overflow = perf_swevent_set_period(event);
5104
5105 if (hwc->interrupts == MAX_INTERRUPTS)
5106 return;
5107
5108 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle,
5110 data, regs)) {
5111 /*
5112 * We inhibit the overflow from happening when
5113 * hwc->interrupts == MAX_INTERRUPTS.
5114 */
5115 break;
5116 }
5117 throttle = 1;
5118 }
5119}
5120
5121static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data,
5123 struct pt_regs *regs)
5124{
5125 struct hw_perf_event *hwc = &event->hw;
5126
5127 local64_add(nr, &event->count);
5128
5129 if (!regs)
5130 return;
5131
5132 if (!is_sampling_event(event))
5133 return;
5134
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs);
5137
5138 if (local64_add_negative(nr, &hwc->period_left))
5139 return;
5140
5141 perf_swevent_overflow(event, 0, nmi, data, regs);
5142}
5143
5144static int perf_exclude_event(struct perf_event *event,
5145 struct pt_regs *regs)
5146{
5147 if (event->hw.state & PERF_HES_STOPPED)
5148 return 1;
5149
5150 if (regs) {
5151 if (event->attr.exclude_user && user_mode(regs))
5152 return 1;
5153
5154 if (event->attr.exclude_kernel && !user_mode(regs))
5155 return 1;
5156 }
5157
5158 return 0;
5159}
5160
5161static int perf_swevent_match(struct perf_event *event,
5162 enum perf_type_id type,
5163 u32 event_id,
5164 struct perf_sample_data *data,
5165 struct pt_regs *regs)
5166{
5167 if (event->attr.type != type)
5168 return 0;
5169
5170 if (event->attr.config != event_id)
5171 return 0;
5172
5173 if (perf_exclude_event(event, regs))
5174 return 0;
5175
5176 return 1;
5177}
5178
5179static inline u64 swevent_hash(u64 type, u32 event_id)
5180{
5181 u64 val = event_id | (type << 32);
5182
5183 return hash_64(val, SWEVENT_HLIST_BITS);
5184}
5185
5186static inline struct hlist_head *
5187__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
5188{
5189 u64 hash = swevent_hash(type, event_id);
5190
5191 return &hlist->heads[hash];
5192}
5193
5194/* For the read side: events when they trigger */
5195static inline struct hlist_head *
5196find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
5197{
5198 struct swevent_hlist *hlist;
5199
5200 hlist = rcu_dereference(swhash->swevent_hlist);
5201 if (!hlist)
5202 return NULL;
5203
5204 return __find_swevent_head(hlist, type, event_id);
5205}
5206
5207/* For the event head insertion and removal in the hlist */
5208static inline struct hlist_head *
5209find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5210{
5211 struct swevent_hlist *hlist;
5212 u32 event_id = event->attr.config;
5213 u64 type = event->attr.type;
5214
5215 /*
5216 * Event scheduling is always serialized against hlist allocation
5217	 * and release, which makes the protected version suitable here.
5218 * The context lock guarantees that.
5219 */
5220 hlist = rcu_dereference_protected(swhash->swevent_hlist,
5221 lockdep_is_held(&event->ctx->lock));
5222 if (!hlist)
5223 return NULL;
5224
5225 return __find_swevent_head(hlist, type, event_id);
5226}
5227
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi,
5230 struct perf_sample_data *data,
5231 struct pt_regs *regs)
5232{
5233 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5234 struct perf_event *event;
5235 struct hlist_node *node;
5236 struct hlist_head *head;
5237
5238 rcu_read_lock();
5239 head = find_swevent_head_rcu(swhash, type, event_id);
5240 if (!head)
5241 goto end;
5242
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs);
5246 }
5247end:
5248 rcu_read_unlock();
5249}
5250
5251int perf_swevent_get_recursion_context(void)
5252{
5253 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5254
5255 return get_recursion_context(swhash->recursion);
5256}
5257EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5258
5259inline void perf_swevent_put_recursion_context(int rctx)
5260{
5261 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5262
5263 put_recursion_context(swhash->recursion, rctx);
5264}
5265
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5267 struct pt_regs *regs, u64 addr)
5268{
5269 struct perf_sample_data data;
5270 int rctx;
5271
5272 preempt_disable_notrace();
5273 rctx = perf_swevent_get_recursion_context();
5274 if (rctx < 0)
5275 return;
5276
5277 perf_sample_data_init(&data, addr);
5278
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
5280
5281 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace();
5283}
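
/*
 * Typical in-kernel usage goes through the perf_sw_event() wrapper,
 * e.g. from the page fault path:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 */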
5284
5285static void perf_swevent_read(struct perf_event *event)
5286{
5287}
5288
5289static int perf_swevent_add(struct perf_event *event, int flags)
5290{
5291 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5292 struct hw_perf_event *hwc = &event->hw;
5293 struct hlist_head *head;
5294
5295 if (is_sampling_event(event)) {
5296 hwc->last_period = hwc->sample_period;
5297 perf_swevent_set_period(event);
5298 }
5299
5300 hwc->state = !(flags & PERF_EF_START);
5301
5302 head = find_swevent_head(swhash, event);
5303 if (WARN_ON_ONCE(!head))
5304 return -EINVAL;
5305
5306 hlist_add_head_rcu(&event->hlist_entry, head);
5307
5308 return 0;
5309}
5310
5311static void perf_swevent_del(struct perf_event *event, int flags)
5312{
5313 hlist_del_rcu(&event->hlist_entry);
5314}
5315
5316static void perf_swevent_start(struct perf_event *event, int flags)
5317{
5318 event->hw.state = 0;
5319}
5320
5321static void perf_swevent_stop(struct perf_event *event, int flags)
5322{
5323 event->hw.state = PERF_HES_STOPPED;
5324}
5325
5326/* Deref the hlist from the update side */
5327static inline struct swevent_hlist *
5328swevent_hlist_deref(struct swevent_htable *swhash)
5329{
5330 return rcu_dereference_protected(swhash->swevent_hlist,
5331 lockdep_is_held(&swhash->hlist_mutex));
5332}
5333
5334static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5335{
5336 struct swevent_hlist *hlist;
5337
5338 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5339 kfree(hlist);
5340}
5341
5342static void swevent_hlist_release(struct swevent_htable *swhash)
5343{
5344 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
5345
5346 if (!hlist)
5347 return;
5348
5349 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5350 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
5351}
5352
5353static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5354{
5355 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5356
5357 mutex_lock(&swhash->hlist_mutex);
5358
5359 if (!--swhash->hlist_refcount)
5360 swevent_hlist_release(swhash);
5361
5362 mutex_unlock(&swhash->hlist_mutex);
5363}
5364
5365static void swevent_hlist_put(struct perf_event *event)
5366{
5367 int cpu;
5368
5369 if (event->cpu != -1) {
5370 swevent_hlist_put_cpu(event, event->cpu);
5371 return;
5372 }
5373
5374 for_each_possible_cpu(cpu)
5375 swevent_hlist_put_cpu(event, cpu);
5376}
5377
5378static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5379{
5380 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5381 int err = 0;
5382
5383 mutex_lock(&swhash->hlist_mutex);
5384
5385 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
5386 struct swevent_hlist *hlist;
5387
5388 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5389 if (!hlist) {
5390 err = -ENOMEM;
5391 goto exit;
5392 }
5393 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5394 }
5395 swhash->hlist_refcount++;
5396exit:
5397 mutex_unlock(&swhash->hlist_mutex);
5398
5399 return err;
5400}
5401
5402static int swevent_hlist_get(struct perf_event *event)
5403{
5404 int err;
5405 int cpu, failed_cpu;
5406
5407 if (event->cpu != -1)
5408 return swevent_hlist_get_cpu(event, event->cpu);
5409
5410 get_online_cpus();
5411 for_each_possible_cpu(cpu) {
5412 err = swevent_hlist_get_cpu(event, cpu);
5413 if (err) {
5414 failed_cpu = cpu;
5415 goto fail;
5416 }
5417 }
5418 put_online_cpus();
5419
5420 return 0;
5421fail:
5422 for_each_possible_cpu(cpu) {
5423 if (cpu == failed_cpu)
5424 break;
5425 swevent_hlist_put_cpu(event, cpu);
5426 }
5427
5428 put_online_cpus();
5429 return err;
5430}
5431
5432struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5433
5434static void sw_perf_event_destroy(struct perf_event *event)
5435{
5436 u64 event_id = event->attr.config;
5437
5438 WARN_ON(event->parent);
5439
5440 jump_label_dec(&perf_swevent_enabled[event_id]);
5441 swevent_hlist_put(event);
5442}
5443
5444static int perf_swevent_init(struct perf_event *event)
5445{
5446 int event_id = event->attr.config;
5447
5448 if (event->attr.type != PERF_TYPE_SOFTWARE)
5449 return -ENOENT;
5450
5451 switch (event_id) {
5452 case PERF_COUNT_SW_CPU_CLOCK:
5453 case PERF_COUNT_SW_TASK_CLOCK:
5454 return -ENOENT;
5455
5456 default:
5457 break;
5458 }
5459
5460 if (event_id >= PERF_COUNT_SW_MAX)
5461 return -ENOENT;
5462
5463 if (!event->parent) {
5464 int err;
5465
5466 err = swevent_hlist_get(event);
5467 if (err)
5468 return err;
5469
5470 jump_label_inc(&perf_swevent_enabled[event_id]);
5471 event->destroy = sw_perf_event_destroy;
5472 }
5473
5474 return 0;
5475}
5476
5477static struct pmu perf_swevent = {
5478 .task_ctx_nr = perf_sw_context,
5479
5480 .event_init = perf_swevent_init,
5481 .add = perf_swevent_add,
5482 .del = perf_swevent_del,
5483 .start = perf_swevent_start,
5484 .stop = perf_swevent_stop,
5485 .read = perf_swevent_read,
5486};
5487
5488#ifdef CONFIG_EVENT_TRACING
5489
5490static int perf_tp_filter_match(struct perf_event *event,
5491 struct perf_sample_data *data)
5492{
5493 void *record = data->raw->data;
5494
5495 if (likely(!event->filter) || filter_match_preds(event->filter, record))
5496 return 1;
5497 return 0;
5498}
5499
5500static int perf_tp_event_match(struct perf_event *event,
5501 struct perf_sample_data *data,
5502 struct pt_regs *regs)
5503{
5504 if (event->hw.state & PERF_HES_STOPPED)
5505 return 0;
5506 /*
5507 * All tracepoints are from kernel-space.
5508 */
5509 if (event->attr.exclude_kernel)
5510 return 0;
5511
5512 if (!perf_tp_filter_match(event, data))
5513 return 0;
5514
5515 return 1;
5516}
5517
5518void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5519 struct pt_regs *regs, struct hlist_head *head, int rctx)
5520{
5521 struct perf_sample_data data;
5522 struct perf_event *event;
5523 struct hlist_node *node;
5524
5525 struct perf_raw_record raw = {
5526 .size = entry_size,
5527 .data = record,
5528 };
5529
5530 perf_sample_data_init(&data, addr);
5531 data.raw = &raw;
5532
5533 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5534 if (perf_tp_event_match(event, &data, regs))
5535 perf_swevent_event(event, count, 1, &data, regs);
5536 }
5537
5538 perf_swevent_put_recursion_context(rctx);
5539}
5540EXPORT_SYMBOL_GPL(perf_tp_event);
5541
5542static void tp_perf_event_destroy(struct perf_event *event)
5543{
5544 perf_trace_destroy(event);
5545}
5546
5547static int perf_tp_event_init(struct perf_event *event)
5548{
5549 int err;
5550
5551 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5552 return -ENOENT;
5553
5554 err = perf_trace_init(event);
5555 if (err)
5556 return err;
5557
5558 event->destroy = tp_perf_event_destroy;
5559
5560 return 0;
5561}
5562
5563static struct pmu perf_tracepoint = {
5564 .task_ctx_nr = perf_sw_context,
5565
5566 .event_init = perf_tp_event_init,
5567 .add = perf_trace_add,
5568 .del = perf_trace_del,
5569 .start = perf_swevent_start,
5570 .stop = perf_swevent_stop,
5571 .read = perf_swevent_read,
5572};
5573
5574static inline void perf_tp_register(void)
5575{
5576 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
5577}
5578
5579static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5580{
5581 char *filter_str;
5582 int ret;
5583
5584 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5585 return -EINVAL;
5586
5587 filter_str = strndup_user(arg, PAGE_SIZE);
5588 if (IS_ERR(filter_str))
5589 return PTR_ERR(filter_str);
5590
5591 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5592
5593 kfree(filter_str);
5594 return ret;
5595}
5596
5597static void perf_event_free_filter(struct perf_event *event)
5598{
5599 ftrace_profile_free_filter(event);
5600}
5601
5602#else
5603
5604static inline void perf_tp_register(void)
5605{
5606}
5607
5608static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5609{
5610 return -ENOENT;
5611}
5612
5613static void perf_event_free_filter(struct perf_event *event)
5614{
5615}
5616
5617#endif /* CONFIG_EVENT_TRACING */
5618
5619#ifdef CONFIG_HAVE_HW_BREAKPOINT
5620void perf_bp_event(struct perf_event *bp, void *data)
5621{
5622 struct perf_sample_data sample;
5623 struct pt_regs *regs = data;
5624
5625 perf_sample_data_init(&sample, bp->attr.bp_addr);
5626
5627 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5628 perf_swevent_event(bp, 1, 1, &sample, regs);
5629}
5630#endif
5631
5632/*
5633 * hrtimer based swevent callback
5634 */
5635
5636static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5637{
5638 enum hrtimer_restart ret = HRTIMER_RESTART;
5639 struct perf_sample_data data;
5640 struct pt_regs *regs;
5641 struct perf_event *event;
5642 u64 period;
5643
5644 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5645
5646 if (event->state != PERF_EVENT_STATE_ACTIVE)
5647 return HRTIMER_NORESTART;
5648
5649 event->pmu->read(event);
5650
5651 perf_sample_data_init(&data, 0);
5652 data.period = event->hw.last_period;
5653 regs = get_irq_regs();
5654
5655 if (regs && !perf_exclude_event(event, regs)) {
5656 if (!(event->attr.exclude_idle && current->pid == 0))
5657 if (perf_event_overflow(event, 0, &data, regs))
5658 ret = HRTIMER_NORESTART;
5659 }
5660
5661 period = max_t(u64, 10000, event->hw.sample_period);
5662 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5663
5664 return ret;
5665}
5666
5667static void perf_swevent_start_hrtimer(struct perf_event *event)
5668{
5669 struct hw_perf_event *hwc = &event->hw;
5670 s64 period;
5671
5672 if (!is_sampling_event(event))
5673 return;
5674
5675 period = local64_read(&hwc->period_left);
5676 if (period) {
5677 if (period < 0)
5678 period = 10000;
5679
5680 local64_set(&hwc->period_left, 0);
5681 } else {
5682 period = max_t(u64, 10000, hwc->sample_period);
5683 }
5684 __hrtimer_start_range_ns(&hwc->hrtimer,
5685 ns_to_ktime(period), 0,
5686 HRTIMER_MODE_REL_PINNED, 0);
5687}
5688
5689static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5690{
5691 struct hw_perf_event *hwc = &event->hw;
5692
5693 if (is_sampling_event(event)) {
5694 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5695 local64_set(&hwc->period_left, ktime_to_ns(remaining));
5696
5697 hrtimer_cancel(&hwc->hrtimer);
5698 }
5699}
5700
5701static void perf_swevent_init_hrtimer(struct perf_event *event)
5702{
5703 struct hw_perf_event *hwc = &event->hw;
5704
5705 if (!is_sampling_event(event))
5706 return;
5707
5708 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5709 hwc->hrtimer.function = perf_swevent_hrtimer;
5710
5711 /*
5712 * Since hrtimers have a fixed rate, we can do a static freq->period
5713 * mapping and avoid the whole period adjust feedback stuff.
5714 */
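	/*
	 * e.g. attr.sample_freq == 1000 yields a fixed period of
	 * NSEC_PER_SEC / 1000 == 1,000,000 ns (1 ms).
	 */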
5715 if (event->attr.freq) {
5716 long freq = event->attr.sample_freq;
5717
5718 event->attr.sample_period = NSEC_PER_SEC / freq;
5719 hwc->sample_period = event->attr.sample_period;
5720 local64_set(&hwc->period_left, hwc->sample_period);
5721 event->attr.freq = 0;
5722 }
5723}
5724
5725/*
5726 * Software event: cpu wall time clock
5727 */
5728
5729static void cpu_clock_event_update(struct perf_event *event)
5730{
5731 s64 prev;
5732 u64 now;
5733
5734 now = local_clock();
5735 prev = local64_xchg(&event->hw.prev_count, now);
5736 local64_add(now - prev, &event->count);
5737}
5738
5739static void cpu_clock_event_start(struct perf_event *event, int flags)
5740{
5741 local64_set(&event->hw.prev_count, local_clock());
5742 perf_swevent_start_hrtimer(event);
5743}
5744
5745static void cpu_clock_event_stop(struct perf_event *event, int flags)
5746{
5747 perf_swevent_cancel_hrtimer(event);
5748 cpu_clock_event_update(event);
5749}
5750
5751static int cpu_clock_event_add(struct perf_event *event, int flags)
5752{
5753 if (flags & PERF_EF_START)
5754 cpu_clock_event_start(event, flags);
5755
5756 return 0;
5757}
5758
5759static void cpu_clock_event_del(struct perf_event *event, int flags)
5760{
5761 cpu_clock_event_stop(event, flags);
5762}
5763
5764static void cpu_clock_event_read(struct perf_event *event)
5765{
5766 cpu_clock_event_update(event);
5767}
5768
5769static int cpu_clock_event_init(struct perf_event *event)
5770{
5771 if (event->attr.type != PERF_TYPE_SOFTWARE)
5772 return -ENOENT;
5773
5774 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5775 return -ENOENT;
5776
5777 perf_swevent_init_hrtimer(event);
5778
5779 return 0;
5780}
5781
5782static struct pmu perf_cpu_clock = {
5783 .task_ctx_nr = perf_sw_context,
5784
5785 .event_init = cpu_clock_event_init,
5786 .add = cpu_clock_event_add,
5787 .del = cpu_clock_event_del,
5788 .start = cpu_clock_event_start,
5789 .stop = cpu_clock_event_stop,
5790 .read = cpu_clock_event_read,
5791};
5792
5793/*
5794 * Software event: task time clock
5795 */
5796
5797static void task_clock_event_update(struct perf_event *event, u64 now)
5798{
5799 u64 prev;
5800 s64 delta;
5801
5802 prev = local64_xchg(&event->hw.prev_count, now);
5803 delta = now - prev;
5804 local64_add(delta, &event->count);
5805}
5806
5807static void task_clock_event_start(struct perf_event *event, int flags)
5808{
5809 local64_set(&event->hw.prev_count, event->ctx->time);
5810 perf_swevent_start_hrtimer(event);
5811}
5812
5813static void task_clock_event_stop(struct perf_event *event, int flags)
5814{
5815 perf_swevent_cancel_hrtimer(event);
5816 task_clock_event_update(event, event->ctx->time);
5817}
5818
5819static int task_clock_event_add(struct perf_event *event, int flags)
5820{
5821 if (flags & PERF_EF_START)
5822 task_clock_event_start(event, flags);
5823
5824 return 0;
5825}
5826
5827static void task_clock_event_del(struct perf_event *event, int flags)
5828{
5829 task_clock_event_stop(event, PERF_EF_UPDATE);
5830}
5831
5832static void task_clock_event_read(struct perf_event *event)
5833{
5834 u64 now = perf_clock();
5835 u64 delta = now - event->ctx->timestamp;
5836 u64 time = event->ctx->time + delta;
5837
5838 task_clock_event_update(event, time);
5839}
5840
5841static int task_clock_event_init(struct perf_event *event)
5842{
5843 if (event->attr.type != PERF_TYPE_SOFTWARE)
5844 return -ENOENT;
5845
5846 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5847 return -ENOENT;
5848
5849 perf_swevent_init_hrtimer(event);
5850
5851 return 0;
5852}
5853
5854static struct pmu perf_task_clock = {
5855 .task_ctx_nr = perf_sw_context,
5856
5857 .event_init = task_clock_event_init,
5858 .add = task_clock_event_add,
5859 .del = task_clock_event_del,
5860 .start = task_clock_event_start,
5861 .stop = task_clock_event_stop,
5862 .read = task_clock_event_read,
5863};
5864
5865static void perf_pmu_nop_void(struct pmu *pmu)
5866{
5867}
5868
5869static int perf_pmu_nop_int(struct pmu *pmu)
5870{
5871 return 0;
5872}
5873
5874static void perf_pmu_start_txn(struct pmu *pmu)
5875{
5876 perf_pmu_disable(pmu);
5877}
5878
5879static int perf_pmu_commit_txn(struct pmu *pmu)
5880{
5881 perf_pmu_enable(pmu);
5882 return 0;
5883}
5884
5885static void perf_pmu_cancel_txn(struct pmu *pmu)
5886{
5887 perf_pmu_enable(pmu);
5888}
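
/*
 * These stubs back pmus without native transaction support: group
 * scheduling brackets the insertion of an event group with
 * start_txn()/commit_txn() and falls back to cancel_txn() when a
 * member fails to schedule, so wrapping that window in
 * pmu_disable()/pmu_enable() at least batches the hardware accesses.
 */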
5889
5890/*
5891 * Ensures all contexts with the same task_ctx_nr have the same
5892 * pmu_cpu_context too.
5893 */
5894static void *find_pmu_context(int ctxn)
5895{
5896 struct pmu *pmu;
5897
5898 if (ctxn < 0)
5899 return NULL;
5900
5901 list_for_each_entry(pmu, &pmus, entry) {
5902 if (pmu->task_ctx_nr == ctxn)
5903 return pmu->pmu_cpu_context;
5904 }
5905
5906 return NULL;
5907}
5908
5909static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5910{
5911 int cpu;
5912
5913 for_each_possible_cpu(cpu) {
5914 struct perf_cpu_context *cpuctx;
5915
5916 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5917
5918 if (cpuctx->active_pmu == old_pmu)
5919 cpuctx->active_pmu = pmu;
5920 }
5921}
5922
5923static void free_pmu_context(struct pmu *pmu)
5924{
5925 struct pmu *i;
5926
5927 mutex_lock(&pmus_lock);
5928 /*
5929 * Like a real lame refcount.
5930 */
5931 list_for_each_entry(i, &pmus, entry) {
5932 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5933 update_pmu_context(i, pmu);
5934 goto out;
5935 }
5936 }
5937
5938 free_percpu(pmu->pmu_cpu_context);
5939out:
5940 mutex_unlock(&pmus_lock);
5941}
5942static struct idr pmu_idr;
5943
5944static ssize_t
5945type_show(struct device *dev, struct device_attribute *attr, char *page)
5946{
5947 struct pmu *pmu = dev_get_drvdata(dev);
5948
5949 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5950}
5951
5952static struct device_attribute pmu_dev_attrs[] = {
5953 __ATTR_RO(type),
5954 __ATTR_NULL,
5955};
5956
5957static int pmu_bus_running;
5958static struct bus_type pmu_bus = {
5959 .name = "event_source",
5960 .dev_attrs = pmu_dev_attrs,
5961};
5962
5963static void pmu_dev_release(struct device *dev)
5964{
5965 kfree(dev);
5966}
5967
5968static int pmu_dev_alloc(struct pmu *pmu)
5969{
5970 int ret = -ENOMEM;
5971
5972 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5973 if (!pmu->dev)
5974 goto out;
5975
5976 device_initialize(pmu->dev);
5977 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5978 if (ret)
5979 goto free_dev;
5980
5981 dev_set_drvdata(pmu->dev, pmu);
5982 pmu->dev->bus = &pmu_bus;
5983 pmu->dev->release = pmu_dev_release;
5984 ret = device_add(pmu->dev);
5985 if (ret)
5986 goto free_dev;
5987
5988out:
5989 return ret;
5990
5991free_dev:
5992 put_device(pmu->dev);
5993 goto out;
5994}
5995
5996static struct lock_class_key cpuctx_mutex;
5997
5998int perf_pmu_register(struct pmu *pmu, char *name, int type)
5999{
6000 int cpu, ret;
6001
6002 mutex_lock(&pmus_lock);
6003 ret = -ENOMEM;
6004 pmu->pmu_disable_count = alloc_percpu(int);
6005 if (!pmu->pmu_disable_count)
6006 goto unlock;
6007
6008 pmu->type = -1;
6009 if (!name)
6010 goto skip_type;
6011 pmu->name = name;
6012
6013 if (type < 0) {
6014 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
6015 if (!err)
6016 goto free_pdc;
6017
6018 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
6019 if (err) {
6020 ret = err;
6021 goto free_pdc;
6022 }
6023 }
6024 pmu->type = type;
6025
6026 if (pmu_bus_running) {
6027 ret = pmu_dev_alloc(pmu);
6028 if (ret)
6029 goto free_idr;
6030 }
6031
6032skip_type:
6033 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6034 if (pmu->pmu_cpu_context)
6035 goto got_cpu_context;
6036
6037 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6038 if (!pmu->pmu_cpu_context)
6039 goto free_dev;
6040
6041 for_each_possible_cpu(cpu) {
6042 struct perf_cpu_context *cpuctx;
6043
6044 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6045 __perf_event_init_context(&cpuctx->ctx);
6046 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6047 cpuctx->ctx.type = cpu_context;
6048 cpuctx->ctx.pmu = pmu;
6049 cpuctx->jiffies_interval = 1;
6050 INIT_LIST_HEAD(&cpuctx->rotation_list);
6051 cpuctx->active_pmu = pmu;
6052 }
6053
6054got_cpu_context:
6055 if (!pmu->start_txn) {
6056 if (pmu->pmu_enable) {
6057 /*
6058 * If we have pmu_enable/pmu_disable calls, install
6059			 * transaction stubs that use them to try and batch
6060 * hardware accesses.
6061 */
6062 pmu->start_txn = perf_pmu_start_txn;
6063 pmu->commit_txn = perf_pmu_commit_txn;
6064 pmu->cancel_txn = perf_pmu_cancel_txn;
6065 } else {
6066 pmu->start_txn = perf_pmu_nop_void;
6067 pmu->commit_txn = perf_pmu_nop_int;
6068 pmu->cancel_txn = perf_pmu_nop_void;
6069 }
6070 }
6071
6072 if (!pmu->pmu_enable) {
6073 pmu->pmu_enable = perf_pmu_nop_void;
6074 pmu->pmu_disable = perf_pmu_nop_void;
6075 }
6076
6077 list_add_rcu(&pmu->entry, &pmus);
6078 ret = 0;
6079unlock:
6080 mutex_unlock(&pmus_lock);
6081
6082 return ret;
6083
6084free_dev:
6085 device_del(pmu->dev);
6086 put_device(pmu->dev);
6087
6088free_idr:
6089 if (pmu->type >= PERF_TYPE_MAX)
6090 idr_remove(&pmu_idr, pmu->type);
6091
6092free_pdc:
6093 free_percpu(pmu->pmu_disable_count);
6094 goto unlock;
6095}
6096
6097void perf_pmu_unregister(struct pmu *pmu)
6098{
6099 mutex_lock(&pmus_lock);
6100 list_del_rcu(&pmu->entry);
6101 mutex_unlock(&pmus_lock);
6102
6103 /*
6104 * We dereference the pmu list under both SRCU and regular RCU, so
6105 * synchronize against both of those.
6106 */
6107 synchronize_srcu(&pmus_srcu);
6108 synchronize_rcu();
6109
6110 free_percpu(pmu->pmu_disable_count);
6111 if (pmu->type >= PERF_TYPE_MAX)
6112 idr_remove(&pmu_idr, pmu->type);
6113 device_del(pmu->dev);
6114 put_device(pmu->dev);
6115 free_pmu_context(pmu);
6116}
6117
6118struct pmu *perf_init_event(struct perf_event *event)
6119{
6120 struct pmu *pmu = NULL;
6121 int idx;
6122 int ret;
6123
6124 idx = srcu_read_lock(&pmus_srcu);
6125
6126 rcu_read_lock();
6127 pmu = idr_find(&pmu_idr, event->attr.type);
6128 rcu_read_unlock();
6129 if (pmu) {
6130 ret = pmu->event_init(event);
6131 if (ret)
6132 pmu = ERR_PTR(ret);
6133 goto unlock;
6134 }
6135
6136 list_for_each_entry_rcu(pmu, &pmus, entry) {
6137 ret = pmu->event_init(event);
6138 if (!ret)
6139 goto unlock;
6140
6141 if (ret != -ENOENT) {
6142 pmu = ERR_PTR(ret);
6143 goto unlock;
6144 }
6145 }
6146 pmu = ERR_PTR(-ENOENT);
6147unlock:
6148 srcu_read_unlock(&pmus_srcu, idx);
6149
6150 return pmu;
6151}
6152
6153/*
6154 * Allocate and initialize an event structure
6155 */
6156static struct perf_event *
6157perf_event_alloc(struct perf_event_attr *attr, int cpu,
6158 struct task_struct *task,
6159 struct perf_event *group_leader,
6160 struct perf_event *parent_event,
6161 perf_overflow_handler_t overflow_handler)
6162{
6163 struct pmu *pmu;
6164 struct perf_event *event;
6165 struct hw_perf_event *hwc;
6166 long err;
6167
6168 if ((unsigned)cpu >= nr_cpu_ids) {
6169 if (!task || cpu != -1)
6170 return ERR_PTR(-EINVAL);
6171 }
6172
6173 event = kzalloc(sizeof(*event), GFP_KERNEL);
6174 if (!event)
6175 return ERR_PTR(-ENOMEM);
6176
6177 /*
6178 * Single events are their own group leaders, with an
6179 * empty sibling list:
6180 */
6181 if (!group_leader)
6182 group_leader = event;
6183
6184 mutex_init(&event->child_mutex);
6185 INIT_LIST_HEAD(&event->child_list);
6186
6187 INIT_LIST_HEAD(&event->group_entry);
6188 INIT_LIST_HEAD(&event->event_entry);
6189 INIT_LIST_HEAD(&event->sibling_list);
6190 init_waitqueue_head(&event->waitq);
6191 init_irq_work(&event->pending, perf_pending_event);
6192
6193 mutex_init(&event->mmap_mutex);
6194
6195 event->cpu = cpu;
6196 event->attr = *attr;
6197 event->group_leader = group_leader;
6198 event->pmu = NULL;
6199 event->oncpu = -1;
6200
6201 event->parent = parent_event;
6202
6203 event->ns = get_pid_ns(current->nsproxy->pid_ns);
6204 event->id = atomic64_inc_return(&perf_event_id);
6205
6206 event->state = PERF_EVENT_STATE_INACTIVE;
6207
6208 if (task) {
6209 event->attach_state = PERF_ATTACH_TASK;
6210#ifdef CONFIG_HAVE_HW_BREAKPOINT
6211 /*
6212 * hw_breakpoint is a bit difficult here..
6213 */
6214 if (attr->type == PERF_TYPE_BREAKPOINT)
6215 event->hw.bp_target = task;
6216#endif
6217 }
6218
6219 if (!overflow_handler && parent_event)
6220 overflow_handler = parent_event->overflow_handler;
6221
6222 event->overflow_handler = overflow_handler;
6223
6224 if (attr->disabled)
6225 event->state = PERF_EVENT_STATE_OFF;
6226
6227 pmu = NULL;
6228
6229 hwc = &event->hw;
6230 hwc->sample_period = attr->sample_period;
6231 if (attr->freq && attr->sample_freq)
6232 hwc->sample_period = 1;
6233 hwc->last_period = hwc->sample_period;
6234
6235 local64_set(&hwc->period_left, hwc->sample_period);
6236
6237 /*
6238 * we currently do not support PERF_FORMAT_GROUP on inherited events
6239 */
6240 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6241 goto done;
6242
6243 pmu = perf_init_event(event);
6244
6245done:
6246 err = 0;
6247 if (!pmu)
6248 err = -EINVAL;
6249 else if (IS_ERR(pmu))
6250 err = PTR_ERR(pmu);
6251
6252 if (err) {
6253 if (event->ns)
6254 put_pid_ns(event->ns);
6255 kfree(event);
6256 return ERR_PTR(err);
6257 }
6258
6259 event->pmu = pmu;
6260
6261 if (!event->parent) {
6262 if (event->attach_state & PERF_ATTACH_TASK)
6263 jump_label_inc(&perf_sched_events);
6264 if (event->attr.mmap || event->attr.mmap_data)
6265 atomic_inc(&nr_mmap_events);
6266 if (event->attr.comm)
6267 atomic_inc(&nr_comm_events);
6268 if (event->attr.task)
6269 atomic_inc(&nr_task_events);
6270 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6271 err = get_callchain_buffers();
6272 if (err) {
6273 free_event(event);
6274 return ERR_PTR(err);
6275 }
6276 }
6277 }
6278
6279 return event;
6280}
6281
6282static int perf_copy_attr(struct perf_event_attr __user *uattr,
6283 struct perf_event_attr *attr)
6284{
6285 u32 size;
6286 int ret;
6287
6288 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6289 return -EFAULT;
6290
6291 /*
6292	 * zero the full structure, so that a short copy leaves the rest zeroed.
6293 */
6294 memset(attr, 0, sizeof(*attr));
6295
6296 ret = get_user(size, &uattr->size);
6297 if (ret)
6298 return ret;
6299
6300 if (size > PAGE_SIZE) /* silly large */
6301 goto err_size;
6302
6303 if (!size) /* abi compat */
6304 size = PERF_ATTR_SIZE_VER0;
6305
6306 if (size < PERF_ATTR_SIZE_VER0)
6307 goto err_size;
6308
6309 /*
6310 * If we're handed a bigger struct than we know of,
6311 * ensure all the unknown bits are 0 - i.e. new
6312 * user-space does not rely on any kernel feature
6313	 * extensions we don't know about yet.
6314 */
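	/*
	 * e.g. if user-space hands us sizeof(*attr) + 8 bytes, those
	 * trailing 8 bytes must all be zero or we fail with -E2BIG below.
	 */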
6315 if (size > sizeof(*attr)) {
6316 unsigned char __user *addr;
6317 unsigned char __user *end;
6318 unsigned char val;
6319
6320 addr = (void __user *)uattr + sizeof(*attr);
6321 end = (void __user *)uattr + size;
6322
6323 for (; addr < end; addr++) {
6324 ret = get_user(val, addr);
6325 if (ret)
6326 return ret;
6327 if (val)
6328 goto err_size;
6329 }
6330 size = sizeof(*attr);
6331 }
6332
6333 ret = copy_from_user(attr, uattr, size);
6334 if (ret)
6335 return -EFAULT;
6336
6337 /*
6338 * If the type exists, the corresponding creation will verify
6339 * the attr->config.
6340 */
6341 if (attr->type >= PERF_TYPE_MAX)
6342 return -EINVAL;
6343
6344 if (attr->__reserved_1)
6345 return -EINVAL;
6346
6347 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6348 return -EINVAL;
6349
6350 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6351 return -EINVAL;
6352
6353out:
6354 return ret;
6355
6356err_size:
6357 put_user(sizeof(*attr), &uattr->size);
6358 ret = -E2BIG;
6359 goto out;
6360}
6361
6362static int
6363perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6364{
6365 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
6366 int ret = -EINVAL;
6367
6368 if (!output_event)
6369 goto set;
6370
6371 /* don't allow circular references */
6372 if (event == output_event)
6373 goto out;
6374
6375 /*
6376 * Don't allow cross-cpu buffers
6377 */
6378 if (output_event->cpu != event->cpu)
6379 goto out;
6380
6381 /*
6382	 * If it's not a per-cpu buffer, it must be the same task.
6383 */
6384 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6385 goto out;
6386
6387set:
6388 mutex_lock(&event->mmap_mutex);
6389 /* Can't redirect output if we've got an active mmap() */
6390 if (atomic_read(&event->mmap_count))
6391 goto unlock;
6392
6393 if (output_event) {
6394 /* get the buffer we want to redirect to */
6395 buffer = perf_buffer_get(output_event);
6396 if (!buffer)
6397 goto unlock;
6398 }
6399
6400 old_buffer = event->buffer;
6401 rcu_assign_pointer(event->buffer, buffer);
6402 ret = 0;
6403unlock:
6404 mutex_unlock(&event->mmap_mutex);
6405
6406 if (old_buffer)
6407 perf_buffer_put(old_buffer);
6408out:
6409 return ret;
6410}
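
/*
 * User-space reaches this either via ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
 * target_fd) or by passing PERF_FLAG_FD_OUTPUT together with group_fd
 * to sys_perf_event_open() below.
 */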
6411
6412/**
6413 * sys_perf_event_open - open a performance event, associate it to a task/cpu
6414 *
6415 * @attr_uptr: event_id type attributes for monitoring/sampling
6416 * @pid: target pid
6417 * @cpu: target cpu
6418 * @group_fd: group leader event fd
 * @flags: perf event open flags (PERF_FLAG_*)
6419 */
6420SYSCALL_DEFINE5(perf_event_open,
6421 struct perf_event_attr __user *, attr_uptr,
6422 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6423{
6424 struct perf_event *group_leader = NULL, *output_event = NULL;
6425 struct perf_event *event, *sibling;
6426 struct perf_event_attr attr;
6427 struct perf_event_context *ctx;
6428 struct file *event_file = NULL;
6429 struct file *group_file = NULL;
6430 struct task_struct *task = NULL;
6431 struct pmu *pmu;
6432 int event_fd;
6433 int move_group = 0;
6434 int fput_needed = 0;
6435 int err;
6436
6437 /* for future expandability... */
6438 if (flags & ~PERF_FLAG_ALL)
6439 return -EINVAL;
6440
6441 err = perf_copy_attr(attr_uptr, &attr);
6442 if (err)
6443 return err;
6444
6445 if (!attr.exclude_kernel) {
6446 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6447 return -EACCES;
6448 }
6449
6450 if (attr.freq) {
6451 if (attr.sample_freq > sysctl_perf_event_sample_rate)
6452 return -EINVAL;
6453 }
6454
6455 /*
6456 * In cgroup mode, the pid argument is used to pass a file descriptor
6457 * opened on the cgroup directory in cgroupfs. The cpu argument
6458 * designates the cpu on which to monitor threads from that
6459 * cgroup.
6460 */
6461 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6462 return -EINVAL;
6463
6464 event_fd = get_unused_fd_flags(O_RDWR);
6465 if (event_fd < 0)
6466 return event_fd;
6467
6468 if (group_fd != -1) {
6469 group_leader = perf_fget_light(group_fd, &fput_needed);
6470 if (IS_ERR(group_leader)) {
6471 err = PTR_ERR(group_leader);
6472 goto err_fd;
6473 }
6474 group_file = group_leader->filp;
6475 if (flags & PERF_FLAG_FD_OUTPUT)
6476 output_event = group_leader;
6477 if (flags & PERF_FLAG_FD_NO_GROUP)
6478 group_leader = NULL;
6479 }
6480
6481 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
6482 task = find_lively_task_by_vpid(pid);
6483 if (IS_ERR(task)) {
6484 err = PTR_ERR(task);
6485 goto err_group_fd;
6486 }
6487 }
6488
6489 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
6490 if (IS_ERR(event)) {
6491 err = PTR_ERR(event);
6492 goto err_task;
6493 }
6494
6495 if (flags & PERF_FLAG_PID_CGROUP) {
6496 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6497 if (err)
6498 goto err_alloc;
6499 /*
6500 * one more event:
6501 * - that has cgroup constraint on event->cpu
6502 * - that may need work on context switch
6503 */
6504 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6505 jump_label_inc(&perf_sched_events);
6506 }
6507
6508 /*
6509 * Special case software events and allow them to be part of
6510 * any hardware group.
6511 */
6512 pmu = event->pmu;
6513
6514 if (group_leader &&
6515 (is_software_event(event) != is_software_event(group_leader))) {
6516 if (is_software_event(event)) {
6517 /*
6518 * If event and group_leader are not both software
6519 * events, and event is, then the group leader is not.
6520 *
6521 * Allow the addition of software events to !software
6522 * groups; this is safe because software events never
6523 * fail to schedule.
6524 */
6525 pmu = group_leader->pmu;
6526 } else if (is_software_event(group_leader) &&
6527 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6528 /*
6529 * In case the group is a pure software group, and we
6530 * try to add a hardware event, move the whole group to
6531 * the hardware context.
6532 */
6533 move_group = 1;
6534 }
6535 }
6536
6537 /*
6538 * Get the target context (task or percpu):
6539 */
6540 ctx = find_get_context(pmu, task, cpu);
6541 if (IS_ERR(ctx)) {
6542 err = PTR_ERR(ctx);
6543 goto err_alloc;
6544 }
6545
6546 if (task) {
6547 put_task_struct(task);
6548 task = NULL;
6549 }
6550
6551 /*
6552 * Look up the group leader (we will attach this event to it):
6553 */
6554 if (group_leader) {
6555 err = -EINVAL;
6556
6557 /*
6558 * Do not allow a recursive hierarchy (this new sibling
6559 * becoming part of another group-sibling):
6560 */
6561 if (group_leader->group_leader != group_leader)
6562 goto err_context;
6563 /*
6564 * Do not allow attaching to a group in a different
6565 * task or CPU context:
6566 */
6567 if (move_group) {
6568 if (group_leader->ctx->type != ctx->type)
6569 goto err_context;
6570 } else {
6571 if (group_leader->ctx != ctx)
6572 goto err_context;
6573 }
6574
6575 /*
6576 * Only a group leader can be exclusive or pinned
6577 */
6578 if (attr.exclusive || attr.pinned)
6579 goto err_context;
6580 }
6581
6582 if (output_event) {
6583 err = perf_event_set_output(event, output_event);
6584 if (err)
6585 goto err_context;
6586 }
6587
6588 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6589 if (IS_ERR(event_file)) {
6590 err = PTR_ERR(event_file);
6591 goto err_context;
6592 }
6593
6594 if (move_group) {
6595 struct perf_event_context *gctx = group_leader->ctx;
6596
6597 mutex_lock(&gctx->mutex);
6598 perf_remove_from_context(group_leader);
6599 list_for_each_entry(sibling, &group_leader->sibling_list,
6600 group_entry) {
6601 perf_remove_from_context(sibling);
6602 put_ctx(gctx);
6603 }
6604 mutex_unlock(&gctx->mutex);
6605 put_ctx(gctx);
6606 }
6607
6608 event->filp = event_file;
6609 WARN_ON_ONCE(ctx->parent_ctx);
6610 mutex_lock(&ctx->mutex);
6611
6612 if (move_group) {
6613 perf_install_in_context(ctx, group_leader, cpu);
6614 get_ctx(ctx);
6615 list_for_each_entry(sibling, &group_leader->sibling_list,
6616 group_entry) {
6617 perf_install_in_context(ctx, sibling, cpu);
6618 get_ctx(ctx);
6619 }
6620 }
6621
6622 perf_install_in_context(ctx, event, cpu);
6623 ++ctx->generation;
6624 perf_unpin_context(ctx);
6625 mutex_unlock(&ctx->mutex);
6626
6627 event->owner = current;
6628
6629 mutex_lock(&current->perf_event_mutex);
6630 list_add_tail(&event->owner_entry, &current->perf_event_list);
6631 mutex_unlock(&current->perf_event_mutex);
6632
6633 /*
6634 * Precalculate sample_data sizes
6635 */
6636 perf_event__header_size(event);
6637 perf_event__id_header_size(event);
6638
6639 /*
6640 * Drop the reference on the group_event after placing the
6641 * new event on the sibling_list. This ensures destruction
6642 * of the group leader will find the pointer to itself in
6643 * perf_group_detach().
6644 */
6645 fput_light(group_file, fput_needed);
6646 fd_install(event_fd, event_file);
6647 return event_fd;
6648
6649err_context:
6650 perf_unpin_context(ctx);
6651 put_ctx(ctx);
6652err_alloc:
6653 free_event(event);
6654err_task:
6655 if (task)
6656 put_task_struct(task);
6657err_group_fd:
6658 fput_light(group_file, fput_needed);
6659err_fd:
6660 put_unused_fd(event_fd);
6661 return err;
6662}
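/*
 * A minimal sketch of the common self-monitoring case served by the
 * syscall above (illustrative only; error handling omitted). pid 0 means
 * the calling task, cpu -1 any cpu, group_fd -1 no group, flags 0:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(attr),
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	long long count;
 *	int fd;
 *
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	... run the workload to be measured ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *
 * With the default read_format the read() returns the raw counter value.
 */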
6663
6664/**
6665 * perf_event_create_kernel_counter
6666 *
6667 * @attr: attributes of the counter to create
6668 * @cpu: cpu to which the counter is bound
6669 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
6670 */
6671struct perf_event *
6672perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 struct task_struct *task,
6674 perf_overflow_handler_t overflow_handler)
6675{
6676 struct perf_event_context *ctx;
6677 struct perf_event *event;
6678 int err;
6679
6680 /*
6681 * Get the target context (task or percpu):
6682 */
6683
6684 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
6685 if (IS_ERR(event)) {
6686 err = PTR_ERR(event);
6687 goto err;
6688 }
6689
6690 ctx = find_get_context(event->pmu, task, cpu);
6691 if (IS_ERR(ctx)) {
6692 err = PTR_ERR(ctx);
6693 goto err_free;
6694 }
6695
6696 event->filp = NULL;
6697 WARN_ON_ONCE(ctx->parent_ctx);
6698 mutex_lock(&ctx->mutex);
6699 perf_install_in_context(ctx, event, cpu);
6700 ++ctx->generation;
6701 perf_unpin_context(ctx);
6702 mutex_unlock(&ctx->mutex);
6703
6704 return event;
6705
6706err_free:
6707 free_event(event);
6708err:
6709 return ERR_PTR(err);
6710}
6711EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
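/*
 * Sketch of typical in-kernel usage, modelled on callers such as the
 * hw_breakpoint and watchdog code (illustrative; the attr values and the
 * overflow handler name are made up):
 *
 *	static struct perf_event_attr wd_attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(struct perf_event_attr),
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.sample_period	= 100000,
 *		.pinned		= 1,
 *	};
 *	struct perf_event *event;
 *
 *	event = perf_event_create_kernel_counter(&wd_attr, cpu, NULL,
 *						  my_overflow_handler);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 * perf_event_release_kernel() tears such a counter down again when the
 * caller is done with it.
 */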
6712
6713static void sync_child_event(struct perf_event *child_event,
6714 struct task_struct *child)
6715{
6716 struct perf_event *parent_event = child_event->parent;
6717 u64 child_val;
6718
6719 if (child_event->attr.inherit_stat)
6720 perf_event_read_event(child_event, child);
6721
6722 child_val = perf_event_count(child_event);
6723
6724 /*
6725 * Add back the child's count to the parent's count:
6726 */
6727 atomic64_add(child_val, &parent_event->child_count);
6728 atomic64_add(child_event->total_time_enabled,
6729 &parent_event->child_total_time_enabled);
6730 atomic64_add(child_event->total_time_running,
6731 &parent_event->child_total_time_running);
6732
6733 /*
6734 * Remove this event from the parent's list
6735 */
6736 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6737 mutex_lock(&parent_event->child_mutex);
6738 list_del_init(&child_event->child_list);
6739 mutex_unlock(&parent_event->child_mutex);
6740
6741 /*
6742 * Release the parent event, if this was the last
6743 * reference to it.
6744 */
6745 fput(parent_event->filp);
6746}
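/*
 * Net effect for userspace: an event opened with attr.inherit set is
 * cloned into each child created afterwards (see inherit_event() below),
 * and the code above folds every child's count and times back into the
 * parent when the child exits, so a read() on the parent fd reports the
 * aggregate over the whole task tree.
 */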
6747
6748static void
6749__perf_event_exit_task(struct perf_event *child_event,
6750 struct perf_event_context *child_ctx,
6751 struct task_struct *child)
6752{
6753 if (child_event->parent) {
6754 raw_spin_lock_irq(&child_ctx->lock);
6755 perf_group_detach(child_event);
6756 raw_spin_unlock_irq(&child_ctx->lock);
6757 }
6758
6759 perf_remove_from_context(child_event);
6760
6761 /*
6762 * It can happen that the parent exits first, and has events
6763 * that are still around due to the child reference. These
6764 * events need to be zapped.
6765 */
6766 if (child_event->parent) {
6767 sync_child_event(child_event, child);
6768 free_event(child_event);
6769 }
6770}
6771
6772static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6773{
6774 struct perf_event *child_event, *tmp;
6775 struct perf_event_context *child_ctx;
6776 unsigned long flags;
6777
6778 if (likely(!child->perf_event_ctxp[ctxn])) {
6779 perf_event_task(child, NULL, 0);
6780 return;
6781 }
6782
6783 local_irq_save(flags);
6784 /*
6785 * We can't reschedule here because interrupts are disabled,
6786 * and either child is current or it is a task that can't be
6787 * scheduled, so we are now safe from rescheduling changing
6788 * our context.
6789 */
6790 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6791 task_ctx_sched_out(child_ctx, EVENT_ALL);
6792
6793 /*
6794 * Take the context lock here so that if find_get_context is
6795 * reading child->perf_event_ctxp, we wait until it has
6796 * incremented the context's refcount before we do put_ctx below.
6797 */
6798 raw_spin_lock(&child_ctx->lock);
6799 child->perf_event_ctxp[ctxn] = NULL;
6800 /*
6801 * If this context is a clone, unclone it so it can't get
6802 * swapped to another process while we're removing all
6803 * the events from it.
6804 */
6805 unclone_ctx(child_ctx);
6806 update_context_time(child_ctx);
6807 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6808
6809 /*
6810 * Report the task dead after unscheduling the events so that we
6811 * won't get any samples after PERF_RECORD_EXIT. We can however still
6812 * get a few PERF_RECORD_READ events.
6813 */
6814 perf_event_task(child, child_ctx, 0);
6815
6816 /*
6817 * We can recurse on the same lock type through:
6818 *
6819 * __perf_event_exit_task()
6820 * sync_child_event()
6821 * fput(parent_event->filp)
6822 * perf_release()
6823 * mutex_lock(&ctx->mutex)
6824 *
6825 * But since it's the parent context it won't be the same instance.
6826 */
6827 mutex_lock(&child_ctx->mutex);
6828
6829again:
6830 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6831 group_entry)
6832 __perf_event_exit_task(child_event, child_ctx, child);
6833
6834 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
6835 group_entry)
6836 __perf_event_exit_task(child_event, child_ctx, child);
6837
6838 /*
6839 * If the last event was a group event, it will have appended all
6840 * its siblings to the list, but we obtained 'tmp' before that, so it
6841 * will still point to the list head terminating the iteration.
6842 */
6843 if (!list_empty(&child_ctx->pinned_groups) ||
6844 !list_empty(&child_ctx->flexible_groups))
6845 goto again;
6846
6847 mutex_unlock(&child_ctx->mutex);
6848
6849 put_ctx(child_ctx);
6850}
6851
6852/*
6853 * When a child task exits, feed back event values to parent events.
6854 */
6855void perf_event_exit_task(struct task_struct *child)
6856{
6857 struct perf_event *event, *tmp;
6858 int ctxn;
6859
6860 mutex_lock(&child->perf_event_mutex);
6861 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6862 owner_entry) {
6863 list_del_init(&event->owner_entry);
6864
6865 /*
6866 * Ensure the list deletion is visible before we clear
6867 * the owner; this closes a race against perf_release() where
6868 * we need to serialize on the owner->perf_event_mutex.
6869 */
6870 smp_wmb();
6871 event->owner = NULL;
6872 }
6873 mutex_unlock(&child->perf_event_mutex);
6874
6875 for_each_task_context_nr(ctxn)
6876 perf_event_exit_task_context(child, ctxn);
6877}
6878
6879static void perf_free_event(struct perf_event *event,
6880 struct perf_event_context *ctx)
6881{
6882 struct perf_event *parent = event->parent;
6883
6884 if (WARN_ON_ONCE(!parent))
6885 return;
6886
6887 mutex_lock(&parent->child_mutex);
6888 list_del_init(&event->child_list);
6889 mutex_unlock(&parent->child_mutex);
6890
6891 fput(parent->filp);
6892
6893 perf_group_detach(event);
6894 list_del_event(event, ctx);
6895 free_event(event);
6896}
6897
6898/*
6899 * free an unexposed, unused context as created by inheritance by
6900 * perf_event_init_task below, used by fork() in case of failure.
6901 */
6902void perf_event_free_task(struct task_struct *task)
6903{
6904 struct perf_event_context *ctx;
6905 struct perf_event *event, *tmp;
6906 int ctxn;
6907
6908 for_each_task_context_nr(ctxn) {
6909 ctx = task->perf_event_ctxp[ctxn];
6910 if (!ctx)
6911 continue;
6912
6913 mutex_lock(&ctx->mutex);
6914again:
6915 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6916 group_entry)
6917 perf_free_event(event, ctx);
6918
6919 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6920 group_entry)
6921 perf_free_event(event, ctx);
6922
6923 if (!list_empty(&ctx->pinned_groups) ||
6924 !list_empty(&ctx->flexible_groups))
6925 goto again;
6926
6927 mutex_unlock(&ctx->mutex);
6928
6929 put_ctx(ctx);
6930 }
6931}
6932
6933void perf_event_delayed_put(struct task_struct *task)
6934{
6935 int ctxn;
6936
6937 for_each_task_context_nr(ctxn)
6938 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6939}
6940
6941/*
6942 * inherit an event from parent task to child task:
6943 */
6944static struct perf_event *
6945inherit_event(struct perf_event *parent_event,
6946 struct task_struct *parent,
6947 struct perf_event_context *parent_ctx,
6948 struct task_struct *child,
6949 struct perf_event *group_leader,
6950 struct perf_event_context *child_ctx)
6951{
6952 struct perf_event *child_event;
6953 unsigned long flags;
6954
6955 /*
6956 * Instead of creating recursive hierarchies of events,
6957 * we link inherited events back to the original parent,
6958 * which is sure to have a filp that we use as the reference
6959 * count:
6960 */
6961 if (parent_event->parent)
6962 parent_event = parent_event->parent;
6963
6964 child_event = perf_event_alloc(&parent_event->attr,
6965 parent_event->cpu,
6966 child,
6967 group_leader, parent_event,
6968 NULL);
6969 if (IS_ERR(child_event))
6970 return child_event;
6971 get_ctx(child_ctx);
6972
6973 /*
6974 * Make the child state follow the state of the parent event,
6975 * not its attr.disabled bit. We hold the parent's mutex,
6976 * so we won't race with perf_event_{en, dis}able_family.
6977 */
6978 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6979 child_event->state = PERF_EVENT_STATE_INACTIVE;
6980 else
6981 child_event->state = PERF_EVENT_STATE_OFF;
6982
6983 if (parent_event->attr.freq) {
6984 u64 sample_period = parent_event->hw.sample_period;
6985 struct hw_perf_event *hwc = &child_event->hw;
6986
6987 hwc->sample_period = sample_period;
6988 hwc->last_period = sample_period;
6989
6990 local64_set(&hwc->period_left, sample_period);
6991 }
6992
6993 child_event->ctx = child_ctx;
6994 child_event->overflow_handler = parent_event->overflow_handler;
6995
6996 /*
6997 * Precalculate sample_data sizes
6998 */
6999 perf_event__header_size(child_event);
7000 perf_event__id_header_size(child_event);
7001
7002 /*
7003 * Link it up in the child's context:
7004 */
7005 raw_spin_lock_irqsave(&child_ctx->lock, flags);
7006 add_event_to_ctx(child_event, child_ctx);
7007 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7008
7009 /*
7010 * Get a reference to the parent filp - we will fput it
7011 * when the child event exits. This is safe to do because
7012 * we are in the parent and we know that the filp still
7013 * exists and has a nonzero count:
7014 */
7015 atomic_long_inc(&parent_event->filp->f_count);
7016
7017 /*
7018 * Link this into the parent event's child list
7019 */
7020 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7021 mutex_lock(&parent_event->child_mutex);
7022 list_add_tail(&child_event->child_list, &parent_event->child_list);
7023 mutex_unlock(&parent_event->child_mutex);
7024
7025 return child_event;
7026}
7027
7028static int inherit_group(struct perf_event *parent_event,
7029 struct task_struct *parent,
7030 struct perf_event_context *parent_ctx,
7031 struct task_struct *child,
7032 struct perf_event_context *child_ctx)
7033{
7034 struct perf_event *leader;
7035 struct perf_event *sub;
7036 struct perf_event *child_ctr;
7037
7038 leader = inherit_event(parent_event, parent, parent_ctx,
7039 child, NULL, child_ctx);
7040 if (IS_ERR(leader))
7041 return PTR_ERR(leader);
7042 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7043 child_ctr = inherit_event(sub, parent, parent_ctx,
7044 child, leader, child_ctx);
7045 if (IS_ERR(child_ctr))
7046 return PTR_ERR(child_ctr);
7047 }
7048 return 0;
7049}
7050
7051static int
7052inherit_task_group(struct perf_event *event, struct task_struct *parent,
7053 struct perf_event_context *parent_ctx,
7054 struct task_struct *child, int ctxn,
7055 int *inherited_all)
7056{
7057 int ret;
7058 struct perf_event_context *child_ctx;
7059
7060 if (!event->attr.inherit) {
7061 *inherited_all = 0;
7062 return 0;
7063 }
7064
7065 child_ctx = child->perf_event_ctxp[ctxn];
7066 if (!child_ctx) {
7067 /*
7068 * This is executed from the parent task context, so
7069 * inherit events that have been marked for cloning.
7070 * First allocate and initialize a context for the
7071 * child.
7072 */
7073
7074 child_ctx = alloc_perf_context(event->pmu, child);
7075 if (!child_ctx)
7076 return -ENOMEM;
7077
7078 child->perf_event_ctxp[ctxn] = child_ctx;
7079 }
7080
7081 ret = inherit_group(event, parent, parent_ctx,
7082 child, child_ctx);
7083
7084 if (ret)
7085 *inherited_all = 0;
7086
7087 return ret;
7088}
7089
7090/*
7091 * Initialize the perf_event context in task_struct
7092 */
7093int perf_event_init_context(struct task_struct *child, int ctxn)
7094{
7095 struct perf_event_context *child_ctx, *parent_ctx;
7096 struct perf_event_context *cloned_ctx;
7097 struct perf_event *event;
7098 struct task_struct *parent = current;
7099 int inherited_all = 1;
7100 unsigned long flags;
7101 int ret = 0;
7102
7103 if (likely(!parent->perf_event_ctxp[ctxn]))
7104 return 0;
7105
7106 /*
7107 * If the parent's context is a clone, pin it so it won't get
7108 * swapped under us.
7109 */
7110 parent_ctx = perf_pin_task_context(parent, ctxn);
7111
7112 /*
7113 * No need to check if parent_ctx != NULL here; since we saw
7114 * it non-NULL earlier, the only reason for it to become NULL
7115 * is if we exit, and since we're currently in the middle of
7116 * a fork we can't be exiting at the same time.
7117 */
7118
7119 /*
7120 * Lock the parent list. No need to lock the child - not PID
7121 * hashed yet and not running, so nobody can access it.
7122 */
7123 mutex_lock(&parent_ctx->mutex);
7124
7125 /*
7126 * We don't have to disable NMIs - we are only looking at
7127 * the list, not manipulating it:
7128 */
7129 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
7130 ret = inherit_task_group(event, parent, parent_ctx,
7131 child, ctxn, &inherited_all);
7132 if (ret)
7133 break;
7134 }
7135
7136 /*
7137 * We can't hold ctx->lock when iterating the ->flexible_groups list due
7138 * to allocations, but we need to prevent rotation because
7139 * rotate_ctx() will change the list from interrupt context.
7140 */
7141 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7142 parent_ctx->rotate_disable = 1;
7143 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7144
7145 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
7146 ret = inherit_task_group(event, parent, parent_ctx,
7147 child, ctxn, &inherited_all);
7148 if (ret)
7149 break;
7150 }
7151
7152 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7153 parent_ctx->rotate_disable = 0;
7154
7155 child_ctx = child->perf_event_ctxp[ctxn];
7156
7157 if (child_ctx && inherited_all) {
7158 /*
7159 * Mark the child context as a clone of the parent
7160 * context, or of whatever the parent is a clone of.
7161 *
7162 * Note that if the parent is a clone, holding
7163 * parent_ctx->lock prevents it from being uncloned.
7164 */
7165 cloned_ctx = parent_ctx->parent_ctx;
7166 if (cloned_ctx) {
7167 child_ctx->parent_ctx = cloned_ctx;
7168 child_ctx->parent_gen = parent_ctx->parent_gen;
7169 } else {
7170 child_ctx->parent_ctx = parent_ctx;
7171 child_ctx->parent_gen = parent_ctx->generation;
7172 }
7173 get_ctx(child_ctx->parent_ctx);
7174 }
7175
7176 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7177 mutex_unlock(&parent_ctx->mutex);
7178
7179 perf_unpin_context(parent_ctx);
7180 put_ctx(parent_ctx);
7181
7182 return ret;
7183}
7184
7185/*
7186 * Initialize all the perf_event contexts in task_struct
7187 */
7188int perf_event_init_task(struct task_struct *child)
7189{
7190 int ctxn, ret;
7191
7192 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7193 mutex_init(&child->perf_event_mutex);
7194 INIT_LIST_HEAD(&child->perf_event_list);
7195
7196 for_each_task_context_nr(ctxn) {
7197 ret = perf_event_init_context(child, ctxn);
7198 if (ret)
7199 return ret;
7200 }
7201
7202 return 0;
7203}
7204
7205static void __init perf_event_init_all_cpus(void)
7206{
7207 struct swevent_htable *swhash;
7208 int cpu;
7209
7210 for_each_possible_cpu(cpu) {
7211 swhash = &per_cpu(swevent_htable, cpu);
7212 mutex_init(&swhash->hlist_mutex);
7213 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
7214 }
7215}
7216
7217static void __cpuinit perf_event_init_cpu(int cpu)
7218{
7219 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7220
7221 mutex_lock(&swhash->hlist_mutex);
7222 if (swhash->hlist_refcount > 0) {
7223 struct swevent_hlist *hlist;
7224
7225 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7226 WARN_ON(!hlist);
7227 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7228 }
7229 mutex_unlock(&swhash->hlist_mutex);
7230}
7231
7232#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
7233static void perf_pmu_rotate_stop(struct pmu *pmu)
7234{
7235 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7236
7237 WARN_ON(!irqs_disabled());
7238
7239 list_del_init(&cpuctx->rotation_list);
7240}
7241
7242static void __perf_event_exit_context(void *__info)
7243{
7244 struct perf_event_context *ctx = __info;
7245 struct perf_event *event, *tmp;
7246
7247 perf_pmu_rotate_stop(ctx->pmu);
7248
7249 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
7250 __perf_remove_from_context(event);
7251 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
7252 __perf_remove_from_context(event);
7253}
7254
7255static void perf_event_exit_cpu_context(int cpu)
7256{
7257 struct perf_event_context *ctx;
7258 struct pmu *pmu;
7259 int idx;
7260
7261 idx = srcu_read_lock(&pmus_srcu);
7262 list_for_each_entry_rcu(pmu, &pmus, entry) {
7263 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
7264
7265 mutex_lock(&ctx->mutex);
7266 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7267 mutex_unlock(&ctx->mutex);
7268 }
7269 srcu_read_unlock(&pmus_srcu, idx);
7270}
7271
7272static void perf_event_exit_cpu(int cpu)
7273{
7274 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7275
7276 mutex_lock(&swhash->hlist_mutex);
7277 swevent_hlist_release(swhash);
7278 mutex_unlock(&swhash->hlist_mutex);
7279
7280 perf_event_exit_cpu_context(cpu);
7281}
7282#else
7283static inline void perf_event_exit_cpu(int cpu) { }
7284#endif
7285
7286static int
7287perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7288{
7289 int cpu;
7290
7291 for_each_online_cpu(cpu)
7292 perf_event_exit_cpu(cpu);
7293
7294 return NOTIFY_OK;
7295}
7296
7297/*
7298 * Run the perf reboot notifier at the very last possible moment so that
7299 * the generic watchdog code runs as long as possible.
7300 */
7301static struct notifier_block perf_reboot_notifier = {
7302 .notifier_call = perf_reboot,
7303 .priority = INT_MIN,
7304};
7305
7306static int __cpuinit
7307perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7308{
7309 unsigned int cpu = (long)hcpu;
7310
7311 switch (action & ~CPU_TASKS_FROZEN) {
7312
7313 case CPU_UP_PREPARE:
7314 case CPU_DOWN_FAILED:
7315 perf_event_init_cpu(cpu);
7316 break;
7317
7318 case CPU_UP_CANCELED:
7319 case CPU_DOWN_PREPARE:
7320 perf_event_exit_cpu(cpu);
7321 break;
7322
7323 default:
7324 break;
7325 }
7326
7327 return NOTIFY_OK;
7328}
7329
7330void __init perf_event_init(void)
7331{
7332 int ret;
7333
7334 idr_init(&pmu_idr);
7335
7336 perf_event_init_all_cpus();
7337 init_srcu_struct(&pmus_srcu);
7338 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7339 perf_pmu_register(&perf_cpu_clock, NULL, -1);
7340 perf_pmu_register(&perf_task_clock, NULL, -1);
7341 perf_tp_register();
7342 perf_cpu_notifier(perf_cpu_notify);
7343 register_reboot_notifier(&perf_reboot_notifier);
7344
7345 ret = init_hw_breakpoint();
7346 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
7347}
7348
7349static int __init perf_event_sysfs_init(void)
7350{
7351 struct pmu *pmu;
7352 int ret;
7353
7354 mutex_lock(&pmus_lock);
7355
7356 ret = bus_register(&pmu_bus);
7357 if (ret)
7358 goto unlock;
7359
7360 list_for_each_entry(pmu, &pmus, entry) {
7361 if (!pmu->name || pmu->type < 0)
7362 continue;
7363
7364 ret = pmu_dev_alloc(pmu);
7365 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
7366 }
7367 pmu_bus_running = 1;
7368 ret = 0;
7369
7370unlock:
7371 mutex_unlock(&pmus_lock);
7372
7373 return ret;
7374}
7375device_initcall(perf_event_sysfs_init);
7376
7377#ifdef CONFIG_CGROUP_PERF
7378static struct cgroup_subsys_state *perf_cgroup_create(
7379 struct cgroup_subsys *ss, struct cgroup *cont)
7380{
7381 struct perf_cgroup *jc;
7382
7383 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7384 if (!jc)
7385 return ERR_PTR(-ENOMEM);
7386
7387 jc->info = alloc_percpu(struct perf_cgroup_info);
7388 if (!jc->info) {
7389 kfree(jc);
7390 return ERR_PTR(-ENOMEM);
7391 }
7392
7393 return &jc->css;
7394}
7395
7396static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7397 struct cgroup *cont)
7398{
7399 struct perf_cgroup *jc;
7400 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7401 struct perf_cgroup, css);
7402 free_percpu(jc->info);
7403 kfree(jc);
7404}
7405
7406static int __perf_cgroup_move(void *info)
7407{
7408 struct task_struct *task = info;
7409 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7410 return 0;
7411}
7412
7413static void perf_cgroup_move(struct task_struct *task)
7414{
7415 task_function_call(task, __perf_cgroup_move, task);
7416}
7417
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task)
7435{
7436 /*
7437 * cgroup_exit() is called in the copy_process() failure path.
7438 * Ignore this case since the task hasn't run yet; this avoids
7439 * trying to poke a half-freed task state from generic code.
7440 */
7441 if (!(task->flags & PF_EXITING))
7442 return;
7443
7444 perf_cgroup_move(task);
7445}
7446
7447struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event",
7449 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach,
7454};
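/*
 * Userspace side of the cgroup mode handled by this subsystem
 * (illustrative sketch; the mount point and group name are assumptions):
 * open a directory in the perf_event cgroup hierarchy and pass that fd
 * in place of a pid, together with PERF_FLAG_PID_CGROUP and an explicit
 * cpu, as required by the pid/cpu check in sys_perf_event_open():
 *
 *	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
 *
 *	fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
 *		     -1, PERF_FLAG_PID_CGROUP);
 */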
7455#endif /* CONFIG_CGROUP_PERF */